diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000000..5c34ec47cf --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,37 @@ +{ + "name": "gem5 Development Container", + "image": "ghcr.io/gem5/devcontainer:latest", + "hostRequirements": { + "cpus": 8, + "memory": "16gb", + "storage": "32gb" + }, + "customizations": { + "vscode": { + "extensions": [ + "eamodio.gitlens", + "GitHub.copilot", + "GitHub.copilot-chat", + "GitHub.vscode-pull-request-github", + "ms-python.debugpy", + "ms-python.isort", + "ms-python.python", + "ms-python.vscode-pylance", + "ms-vscode.cpptools", + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cpptools-themes", + "ms-vscode.makefile-tools", + "ms-vscode-remote.remote-containers", + "Tsinghua-Hexin-Joint-Institute.gem5-slicc", + "VisualStudioExptTeam.vscodeintellicode" + ] + } + }, + "features": { + "ghcr.io/devcontainers/features/docker-in-docker:2": {}, + "ghcr.io/devcontainers/features/github-cli:1": {}, + "ghcr.io/devcontainers-contrib/features/actionlint:1": {}, + "ghcr.io/devcontainers-contrib/features/vscode-cli:1": {} + }, + "onCreateCommand": "./.devcontainer/on-create.sh" +} diff --git a/.devcontainer/on-create.sh b/.devcontainer/on-create.sh new file mode 100755 index 0000000000..77f642c32b --- /dev/null +++ b/.devcontainer/on-create.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Copyright (c) 2024 The Regents of the University of California +# All Rights Reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This script is run when the Docker container specified in devcontainer.json +# is created. + +set -e + +# Refresh the git index. +git update-index + +# Install the pre-commit checks. 
+./util/pre-commit-install.sh diff --git a/.gitignore b/.gitignore index d1904756d2..36ba603fb6 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ configs/dram/lowp_sweep.cfg .pyenv .vscode typings +.DS_Store diff --git a/.mailmap b/.mailmap index 49c438d3eb..1ce4d098c6 100644 --- a/.mailmap +++ b/.mailmap @@ -1,8 +1,11 @@ Abdul Mutaal Ahmad adarshpatil +Aditya K Kamath aditya Adrià Armejach Adrià Armejach +Adrià Armejach Adrià Armejach <66964292+aarmejach@users.noreply.github.com> Adrian Herrera Adrien Pesle +Adwaith R Krishna Akash Bagdia Akash Bagdia Alec Roelke Alec Roelke Alexander Klimov @@ -10,21 +13,19 @@ Alexandru Dutu Alexandru Alex Richardson Ali Jafri Ali Saidi Ali Saidi -Ali Saidi Ali Saidi Ali Saidi Ali Saidi Alistair Delva +Alvaro Moreno Amin Farmahini Anders Handler Andrea Mondelli Andrea Mondelli -Andrea Mondelli Andrea Mondelli Andrea Pellegrini Andreas Hansson Andreas Hansson Andreas Hansson Andreas Hansson -Andreas Hansson Andreas Hansson Andreas Hansson Andreas Hansson Andreas Sandberg Andreas Sandberg -Andreas Sandberg Andreas Sandberg Andreas Sandberg Andreas Sandberg +Andreas Sandberg Andreas Sandberg Andrew Bardsley Andrew Bardsley Andrew Lukefahr Andrew Schultz @@ -32,11 +33,14 @@ Andriani Mappoura Angie Lee Anis Peysieux Ani Udipi +anoop Anouk Van Laer ARM gem5 Developers Arthur Perais Arthur Perais Arun Rodrigues Ashkan Tousi +atrah22 +Atri Bhattacharyya Austin Harris Austin Harris Avishai Tvila Ayaz Akram @@ -48,6 +52,7 @@ Bjoern A. Zeeb Blake Hechtman Blake Hechtman Blake Hechtman Blake Hechtman ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) Bobby R. Bruce Bobby Bruce +Bobby R. 
Bruce Bobby Bruce Boris Shingarov Boris Shingarov Brad Beckmann Brad Beckmann Brad Beckmann Brad Beckmann ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) @@ -60,15 +65,13 @@ Brian Grayson Cagdas Dirik cdirik Carlos Falquez Chander Sudanthi Chander Sudanthi -Chander Sudanthi Chander Sudanthi Charles Jamieson -CHEN Meng +Chen Meng Chen Zou Chia-You Chen -Chow, Marcus +Marcus Chow Chris Adeniyi-Jones Chris Emmons Chris Emmons -Chris Emmons Chris Emmons Chris January Christian Menard Christian Menard Christopher Torng @@ -83,17 +86,19 @@ Daecheol You Dam Sunwoo Dan Gibson Daniel Carvalho Daniel +Daniel Carvalho Daniel Carvalho Daniel Carvalho Daniel R. Carvalho Daniel Gerzhoy Daniel Johnson +Daniel Kouchekinia Daniel Sanchez Davide Basilio Bartolini David Guillen-Fandos David Guillen David Guillen-Fandos David Guillen Fandos David Hashe David Hashe David Oehmke -David Schall -Derek Christ +David Schall David Schall +Derek Christ Derek Christ <44267643+derchr@users.noreply.github.com> Derek Hower Deyaun Guo Deyuan Guo ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) Deyaun Guo Deyuan Guo @@ -107,11 +112,12 @@ Earl Ou eavivi Éder F. 
Zulian Edmund Grimley Evans -Eduardo José Gómez Hernández +Eduardo José Gómez Hernández Eduardo José Gómez Hernández Eliot Moss Emilio Castillo Emilio Castillo Emilio Castillo Emilio Castillo ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) Emily Brickey +Emin Gadzhiev Erfan Azarkhish Erhu Eric Van Hensbergen Eric Van Hensbergen @@ -125,11 +131,12 @@ Gabe Black Gabe Black Gabe Black Gabe Black Gabe Loh gloh Gabor Dozsa -Gabriel Busnot +Gabriel Busnot Gabriel Busnot +Gabriel Busnot Gabriel Busnot gauravjain14 +Gautham Pathak Gedare Bloom Gedare Bloom Gene Wu Gene WU -Gene WU Gene Wu Geoffrey Blake Geoffrey Blake Geoffrey Blake Geoffrey Blake Georg Kotheimer @@ -140,10 +147,14 @@ GWDx Hamid Reza Khaleghzadeh Hamid Reza Khaleghzadeh ext:(%2C%20Lluc%20Alvarez%20%3Clluc.alvarez%40bsc.es%3E%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E) handsomeliu Hanhwi Jang -Hoa Nguyen +Harshil Patel Harshil Patel +Harshil Patel Harshil Patel <91860903+Harshil2107@users.noreply.github.com> +Wenjian He +HJikram +Hoa Nguyen Hoa Nguyen Hongil Yoon Hsuan Hsu -huangjs +hungweihsu hungweihsuG <145444687+hungweihsuG@users.noreply.github.com> Hussein Elnawawy Ian Jiang IanJiangICT @@ -152,9 +163,13 @@ Iru Cai Isaac Richter Isaac Sánchez Barrera Ivan Pizarro -Jack Whitham Jack Whitman +Ivan Turasov +Ivana Mitrovic Ivana Mitrovic +Ivana Mitrovic ivanaamit +Jack Whitham Jairo Balart Jakub Jermar +James Braun James Clarkson Jan-Peter Larsson Jan Vrany @@ -174,8 +189,8 @@ Jayneel Gandhi Jennifer Treichler Jerin Joy Jiajie Chen -Jiasen Huang -Jiasen +Jiasen Huang Jiasen +Jiasen Huang huangjs Jiayi Huang jiegec Jieming Yin jiemingyin @@ -188,14 +203,17 @@ Joel Hestness Joel Hestness Joël Porquet-Lupine John Alsop John Kalamatianos jkalamat +Johnny Jordi Vaquero Jose Marinho Juan M. 
Cebrian Jui-min Lee -kai.ren Kai Ren +Kai Ren kai.ren +Kai Ren Kai Ren +KaiBatley <68886332+KaiBatley@users.noreply.github.com> Kanishk Sugand Karthik Sangaiah -Kaustav Goswami +Kaustav Goswami Kaustav Goswami <39310478+kaustav-goswami@users.noreply.github.com> Kelly Nguyen Ke Meng Kevin Brodsky @@ -206,11 +224,16 @@ Koan-Sin Tan Korey Sewell Krishnendra Nathella Krishnendra Nathella ksco -kunpai +Kunal Pai Kunal Pai <62979320+kunpai@users.noreply.github.com> +Kunal Pai kunpai +Kunal Pai paikunal +Kunal Pai KUNAL PAI Kyle Roarty Kyle Roarty Laura Hinman Lena Olson Lena Olson Lena Olson Lena Olson +Leo Redivo leoredivo <94771718+leoredivo@users.noreply.github.com> +Lingkang Lisa Hsu Lisa Hsu Lluc Alvarez Lluís Vilanova Lluis Vilanova @@ -221,9 +244,11 @@ Mahyar Samani Majid Jalili Malek Musleh Nilay Vaish ext:(%2C%20Malek%20Musleh%20%3Cmalek.musleh%40gmail.com%3E) Marc Mari Barcelo -Marco Balboni -Marco Elver Marco Elver Marc Orr Marc Orr +Marco Balboni +Marco Chen +Marco Elver Marco Elver +Marco Kurzynski Marjan Fariborz marjanfariborz Mark Hildebrand Marton Erdos @@ -233,20 +258,18 @@ Matteo Andreozzi Matteo Andreozzi Matt Evans Matt Evans Matthew Poremba Matthew Poremba +Matthias Boettcher Matthias Hille -Matthias Jung -Matthias Jung -Matt Horsnell Matt Horsnell +Matthias Jung Matthias Jung Matt Horsnell Matt Horsnell -Matt Horsnell Matt Horsnell Matt Poremba Matt Poremba -Matt Sinclair Matthew Sinclair -Matt Sinclair Matt Sinclair +Matt Sinclair Matt Sinclair +Matt Sinclair Matthew Sinclair Maurice Becker Maxime Martinasso Maximilian Stein Maximilian Stein Maximilien Breughe Maximilien Breughe -Melissa Jost +Melissa Jost Melissa Jost <50555529+mkjost0@users.noreply.github.com> Michael Adler Michael Boyer Michael LeBeane Michael LeBeane @@ -262,7 +285,6 @@ Min Kyu Jeong Min Kyu Jeong Mitch Hayenga Mitchell Hayenga Mitch Hayenga Mitch Hayenga ext:(%2C%20Amin%20Farmahini%20%3Caminfar%40gmail.com%3E) Mitch Hayenga Mitch Hayenga -Mitch Hayenga Mitch Hayenga Mitch 
Hayenga Mitch Hayenga Mohammad Alian Monir Mozumder @@ -279,13 +301,17 @@ Nathan Binkert Nathan Binkert Nayan Deshmukh Neha Agarwal Neil Natekar -Nicholas Lindsay +Nicholas Lindsay Nicholas Lindsay +Nicholas Mosier Nicholas Mosier Nicolas Boichat Nicolas Derumigny Nicolas Zea +Nikolaos Kyparissas Nikos Nikoleris Nikos Nikoleris Nilay Vaish ext:(%2C%20Timothy%20Jones%20%3Ctimothy.jones%40cl.cam.ac.uk%3E) Nils Asmussen Nils Asmussen +Nitesh Narayana +Nitish Arya <42148385+aryanitish@users.noreply.github.com> Noah Katz ntampouratzis Nuwan Jayasena @@ -293,7 +319,6 @@ Ola Jeppsson Omar Naji Onur Kayiran Pablo Prieto -paikunal Palle Lyckegaard Pau Cabre Paul Rosenfeld Paul Rosenfeld @@ -308,29 +333,39 @@ Po-Hao Su Polina Dudnik Polina Dudnik Polydoros Petrakis Pouya Fotouhi Pouya Fotouhi +Prajwal Hegde Prakash Ramrakhyani Prakash Ramrakhani Prakash Ramrakhyani Prakash Ramrakhyani Pritha Ghoshal +Pu (Luke) Yi Quentin Forcioli Radhika Jagtap Radhika Jagtap Rahul Thakur -Reiley Jeapaul +Rajarshi Das +Ranganath (Bujji) Selagamsetty BujSet +Razeza +Reiley Jeapaul Reiley Jeapaul Rekai Gonzalez-Alberquilla Rekai Gonzalez Alberquilla -Rekai Gonzalez-Alberquilla Rekai Gonzalez Alberquilla Rekai Gonzalez-Alberquilla Rekai Gonzalez-Alberquilla Rekai Gonzalez-Alberquilla Rekai Rene de Jong Ricardo Alves Richard Cooper -Richard D. Strong +Richard Strong Richard D. 
Strong Richard Strong Richard Strong Richard Strong Richard Strong Richard Strong Rick Strong Rico Amslinger Riken Gohil Rizwana Begum +Robert Hauser <85344819+robhau@users.noreply.github.com> Robert Kovacsics Robert Scheffel Robert +Rocky Tatiefo +Roger Chang rogerchang23424 +Roger Chang rogerchang23424 <32214817+rogerchang23424@users.noreply.github.com> +Roger Chang rogerchang23424 +Roger Chang Yu-Cheng Chang Rohit Kurup Ron Dreslinski Ronald Dreslinski Ruben Ayrapetyan @@ -342,23 +377,21 @@ sacak32 Sampad Mohapatra Samuel Grayson Samuel Stark -Sandipan Das <31861871+sandip4n@users.noreply.github.com> Sandipan Das Sandipan Das <31861871+sandip4n@users.noreply.github.com> Santi Galan Sascha Bischoff Sascha Bischoff -Sascha Bischoff Sascha Bischoff +Saúl Adserias <33020671+saul44203@users.noreply.github.com> Sean McGoogan Sean Wilson Sergei Trofimov Severin Wischmann Severin Wischmann ext:(%2C%20Ioannis%20Ilkos%20%3Cioannis.ilkos09%40imperial.ac.uk%3E) Shawn Rosti Sherif Elhabbal -Shivani Parekh -Shivani +Shivani Parekh Shivani Siddhesh Poyarekar +Simon Park Somayeh Sardashti -Sooraj Puthoor -Sooraj Puthoor +Sooraj Puthoor Sooraj Puthoor Sophiane Senni Soumyaroop Roy Srikant Bharadwaj @@ -370,7 +403,6 @@ Steve Raasch Steve Reinhardt Steve Reinhardt ext:(%2C%20Nilay%20Vaish%20%3Cnilay%40cs.wisc.edu%3E%2C%20Ali%20Saidi%20%3CAli.Saidi%40ARM.com%3E) Steve Reinhardt Steve Reinhardt Steve Reinhardt Steve Reinhardt -Steve Reinhardt Steve Reinhardt Stian Hvatum Sudhanshu Jha Sujay Phadke @@ -378,16 +410,18 @@ Sungkeun Kim Swapnil Haria Swapnil Haria Taeho Kgil Tao Zhang +Thilo Vörtler root Thomas Grass Tiago Mück Tiago Muck +Tiberiu Bucur <36485854+TiberiuBucur@users.noreply.github.com> Tim Harris Timothy Hayes Timothy M. Jones Timothy Jones Timothy M. Jones Timothy M. Jones Timothy M. Jones Timothy M. 
Jones Tom Jablin -Tommaso Marinelli Tom Rollet +Tommaso Marinelli Tong Shen Tony Gutierrez Anthony Gutierrez Travis Boraten @@ -401,6 +435,7 @@ Victor Garcia Vilas Sridharan Vincentius Robby Vince Weaver +Vishnu Ramadas vramadas95 vsoria Wade Walker @@ -409,14 +444,16 @@ Weiping Liao Wende Tan Wendy Elsasser William Wang William Wang -William Wang William Wang Willy Wolff Wing Li +wmin0 Xiangyu Dong Xianwei Zhang Xianwei Zhang Xiaoyu Ma Xin Ouyang Xiongfei +Xuan Hu +Yan Lee Yasuko Eckert Yen-lin Lai Yifei Liu @@ -426,7 +463,10 @@ Yuan Yao Yuetsu Kodama yuetsu.kodama Yu-hsin Wang Zhang Zheng -Zhantong Qiu +Zhantong Qiu studyztp Zhengrong Wang seanzw +Zhengrong Wang Zhengrong Wang zhongchengyong Zicong Wang +Zixian Cai <2891235+caizixian@users.noreply.github.com> +zmckevitt diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index d99e7226f3..c2c6b382e0 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,3 +1,161 @@ +# Version 24.0 + +gem5 Version 24.0 is the first major release of 2024. +During this time there have been 298 pull requests merged, comprising of over 600 commits, from 56 unique contributors. + +## API and user-facing changes + +* The GCN3 GPU model has been removed in favor of the newer VEGA_X85 GPU model. +* gem5 now supports building, running, and simulating Ubuntu 24.04. + +### Compiler and OS support + +As of this release gem5 support Clang version 6 to 16 and GCC version 10 to 13. +While other compilers and versions may work, they are not regularly tested. + +gem5 now supports building, running, and simulating on Ubuntu 24.04. +We continue to support 22.04 with 20.04 being deprecated in the coming year. +The majority of our testing is done on Ubuntu LTS systems though Apple Silicon machines and other Linux distributions have also been used regularly during development. +Improvements have been made to ensure a wider support of operating systems. 
+ +## New features + +### gem5 MultiSim: Multiprocessing for gem5 + +The gem5 "MultiSim" module allows for multiple simulations to be run from a single gem5 execution via a single gem5 configuration script. +This allows for multiple simulations to be run in parallel in a structured manner. + +To use MultiSim first create multiple simulators and add them to the MultiSim with the `add_simulator` function. +If needed, limit the maximum number of parallel processes with the `set_num_processes` function. +Then run the simulations in parallel with the `gem5` binary using `-m gem5.utils.multisim`. + +Here is an example of how to use MultiSim: + +```python +import gem5.util.multisim as multisim + +# Set the maximum number of processes to run in parallel +multisim.set_num_processes(4) + +# Create multiple simulators. +# In this case, one for each workload in the benchmark suite. +for workload in benchmark_suite: + board = X86Board( + # ... + ) + board.set_workload(workload) + + # Useful to set the ID here. This is used to create unique output + # directorires for each gem5 process and can be used to idenfify and + # run gem5 processes individually. + simulator = Simulator(board, id=f"{workload.get_id()}") + multisim.add_simulator(simulator) +``` + +Then to run the simulations in parallel: + +```sh + -m gem5.utils.multisim +``` + +The output directory ("m5out" by default) will contain sub-directories for each simulation run. +The sub-directory will be named after the simulator ID set in the configuration script. +We therefore recommend setting the simulator ID to something meaningful to help identify the output directories (i.e., the workload run or something identifying the meaningful characteristics of the simulated system in comparison to others). + +If only one simulation specified in the config needs run, you can do so with: + +```sh + --list # Lists the simulations by ID + + # Run the simulation with the specified ID. 
+``` + +Example scripts of using MultiSim can be found in "configs/example/gem5_library/multisim". + + +### RISC-V Vector Extension Support + +There have been significant improvements to the RVV support in gem5 including + +* Fixed viota (#1137) +* Fixed vrgather (#1134) +* Added RVV FP16 support (#1123) +* Fixed widening and narrowing instructions (#1079) +* Fixed bug in vfmv.f.s (#863) +* Add unit stride segment loads and stores (#851) (#913) +* Fix vl in masked load/store (#830) +* Add unit-stride loads (#794) +* Fix many RVV instructions (#814) (#805) (#715) + +### General RISC-V bugfixes + +* Fixed problem in TLB lookup (#1264) +* Fixed sign-extended branch target (#1173) +* Fixed compressed jump instructions (#1163) +* Fixed GDB connection (#1152) +* Fixed CSR behavior (#1099) +* Add Integer conditional operations Zicond (#1078) +* Add RISC-V Semihosting support (#681) +* Added more detailed instruction types (#589) +* Fixed 32-bit m5op arguments (#900) +* Fixed c.fswsp and c.fsw (#998) (#1005) +* Update PLIC implementation (#886) +* Fix fflags behavior in O3 (#868) +* Add support for local interrupts (#813) +* Removebit 63 of physical address (#756) + +## Improvements + +* Added an new generator which can generate requests based on [spatter](https://github.com/hpcgarage/spatter) patterns. +* KVM is now supported in the gem5 Standard Library ARM Board. +* Generic Cache template added to the Standard Library: https://github.com/gem5/gem5/pull/745 +* Support added for partitioning caches. +* The Standard Library `obtain_resources` function can request multiple resources at once thus reducing delay associated with multiple requests. +* An official gem5 DevContainer has been added to the gem5 repository. +This can be used to build and run gem5 in consistent environment and enables GitHub Codespaces support. + +### gem5 Python Statistics + +The gem5 Python statistics API has been improved. 
+The gem5 Project's general intent with this improvement is make it easier and more desirable to obtain and interact with gem5 simulation statistics via Python. + +For example, the following code snippet demonstrates how to obtain statistics from a gem5 simulation: + +```python +from m5.stats.gem5stats import get_simstat + +## Setup and run the configuation ... +simstat = get_simstat(board) + +# Print the number of cycles the CPU at index 0 has executed. +print(simstat.cpu[0].numCycles) + +# Strings can also be used to access statistics. +print(simstat['cpu'][0]['numCycles']) + +# Print the total number of cycles executed by all CPUs. +print(sum(simstat.cpu[i].numCycles for i in range(len(simstat.cpu)))) +``` + +We hope the usage of the gem5 Python statistics API will be more intuitive and easier to use while allowing better processing of statistical data. + +### GPU Model + +* Support for MI300X and MI200 GPU models including their features and most instructions. +* ROCm 6.1 disk image and compile docker files have been added. ROCm 5.4.2 and 4.2 resources are removed. +* The deprecated GCN3 ISA has been removed. Use VEGA instead. + +## Bug Fixes + +* An integer overflow error known to affect the `AddrRange` class has been fixed. +* Fix fflags behavior of floating point instruction in RISC-V for Out-of-Order CPUs. + +### Arm FEAT_MPAM Support + +An initial implementation of FEAT_MPAM has been introduced in gem5 with the capability to statically partition +classic caches. Guidance on how to use this is available on a Arm community [blog post](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/gem5-cache-partitioning) + + # Version 23.1 gem5 Version 23.1 is our first release where the development has been on GitHub. 
diff --git a/SConstruct b/SConstruct index ef4d154312..fa6e05177a 100755 --- a/SConstruct +++ b/SConstruct @@ -117,6 +117,8 @@ AddOption('--no-compress-debug', action='store_true', help="Don't compress debug info in build files") AddOption('--with-lto', action='store_true', help='Enable Link-Time Optimization') +AddOption('--with-libcxx', action='store_true', + help='Use libc++ as the C++ standard library (requires Clang)') AddOption('--verbose', action='store_true', help='Print full tool command lines') AddOption('--without-python', action='store_true', @@ -550,11 +552,6 @@ for variant_path in variant_paths: env.Append(CCFLAGS=['-pipe']) env.Append(CCFLAGS=['-fno-strict-aliasing']) - # Enable -Wall and -Wextra and then disable the few warnings that - # we consistently violate - env.Append(CCFLAGS=['-Wall', '-Wundef', '-Wextra', - '-Wno-sign-compare', '-Wno-unused-parameter']) - # We always compile using C++17 env.Append(CXXFLAGS=['-std=c++17']) @@ -567,6 +564,16 @@ for variant_path in variant_paths: with gem5_scons.Configure(env) as conf: conf.CheckLinkFlag('-Wl,--as-needed') + want_libcxx = GetOption('with_libcxx') + if want_libcxx: + with gem5_scons.Configure(env) as conf: + # Try using libc++ if it supports the library. + code = '#include \nint main() { return 0; }' + if (not conf.CheckCxxFlag('-stdlib=libc++') or + not conf.CheckLinkFlag('-stdlib=libc++', code=code) + ): + error('Requested libc++ but it is not usable') + linker = GetOption('linker') if linker: with gem5_scons.Configure(env) as conf: @@ -597,6 +604,13 @@ for variant_path in variant_paths: env.Append(LINKFLAGS=['-Wl,--no-keep-memory']) else: error("Unable to use --no-keep-memory with the linker") + + # Treat warnings as errors but white list some warnings that we + # want to allow (e.g., deprecation warnings). 
+ env.Append(CCFLAGS=['-Werror', + '-Wno-error=deprecated-declarations', + '-Wno-error=deprecated', + ]) else: error('\n'.join(( "Don't know what compiler options to use for your compiler.", @@ -612,8 +626,8 @@ for variant_path in variant_paths: "src/SConscript to support that compiler."))) if env['GCC']: - if compareVersions(env['CXXVERSION'], "7") < 0: - error('gcc version 7 or newer required.\n' + if compareVersions(env['CXXVERSION'], "10") < 0: + error('gcc version 10 or newer required.\n' 'Installed version:', env['CXXVERSION']) # Add the appropriate Link-Time Optimization (LTO) flags if @@ -637,17 +651,6 @@ for variant_path in variant_paths: '-fno-builtin-malloc', '-fno-builtin-calloc', '-fno-builtin-realloc', '-fno-builtin-free']) - if compareVersions(env['CXXVERSION'], "9") < 0: - # `libstdc++fs`` must be explicitly linked for `std::filesystem`` - # in GCC version 8. As of GCC version 9, this is not required. - # - # In GCC 7 the `libstdc++fs`` library explicit linkage is also - # required but the `std::filesystem` is under the `experimental` - # namespace(`std::experimental::filesystem`). - # - # Note: gem5 does not support GCC versions < 7. - env.Append(LIBS=['stdc++fs']) - elif env['CLANG']: if compareVersions(env['CXXVERSION'], "6") < 0: error('clang version 6 or newer required.\n' @@ -665,7 +668,7 @@ for variant_path in variant_paths: env.Append(TCMALLOC_CCFLAGS=['-fno-builtin']) - if compareVersions(env['CXXVERSION'], "11") < 0: + if not want_libcxx and compareVersions(env['CXXVERSION'], "11") < 0: # `libstdc++fs`` must be explicitly linked for `std::filesystem`` # in clang versions 6 through 10. # @@ -679,7 +682,7 @@ for variant_path in variant_paths: # On Mac OS X/Darwin we need to also use libc++ (part of XCode) as # opposed to libstdc++, as the later is dated. 
- if sys.platform == "darwin": + if not want_libcxx and sys.platform == "darwin": env.Append(CXXFLAGS=['-stdlib=libc++']) env.Append(LIBS=['c++']) @@ -688,20 +691,26 @@ for variant_path in variant_paths: if GetOption('with_ubsan'): sanitizers.append('undefined') if GetOption('with_asan'): - # Available for gcc >= 5 or llvm >= 3.1 both a requirement - # by the build system - sanitizers.append('address') - suppressions_file = Dir('util').File('lsan-suppressions').get_abspath() - suppressions_opt = 'suppressions=%s' % suppressions_file - suppressions_opts = ':'.join([suppressions_opt, - 'print_suppressions=0']) - env['ENV']['LSAN_OPTIONS'] = suppressions_opts - print() - warning('To suppress false positive leaks, set the LSAN_OPTIONS ' - 'environment variable to "%s" when running gem5' % - suppressions_opts) - warning('LSAN_OPTIONS=%s' % suppressions_opts) - print() + if env['GCC']: + # Address sanitizer is not supported with GCC. Please see Github + # Issue https://github.com/gem5/gem5/issues/916 for more details. + warning("Address Sanitizer is not supported with GCC. " + "This option will be ignored.") + else: + # Available for llvm >= 3.1. A requirement by the build system. 
+ sanitizers.append('address') + suppressions_file = Dir('util').File('lsan-suppressions')\ + .get_abspath() + suppressions_opt = 'suppressions=%s' % suppressions_file + suppressions_opts = ':'.join([suppressions_opt, + 'print_suppressions=0']) + env['ENV']['LSAN_OPTIONS'] = suppressions_opts + print() + warning('To suppress false positive leaks, set the LSAN_OPTIONS ' + 'environment variable to "%s" when running gem5' % + suppressions_opts) + warning('LSAN_OPTIONS=%s' % suppressions_opts) + print() if sanitizers: sanitizers = ','.join(sanitizers) if env['GCC'] or env['CLANG']: diff --git a/build_opts/ALL b/build_opts/ALL index b44c7a09f7..4f4ae1b8dc 100644 --- a/build_opts/ALL +++ b/build_opts/ALL @@ -7,3 +7,4 @@ USE_POWER_ISA=y USE_RISCV_ISA=y USE_SPARC_ISA=y USE_X86_ISA=y +USE_TEST_OBJECTS=y diff --git a/build_opts/GCN3_X86 b/build_opts/GCN3_X86 deleted file mode 100644 index fd471871b6..0000000000 --- a/build_opts/GCN3_X86 +++ /dev/null @@ -1,6 +0,0 @@ -RUBY=y -RUBY_PROTOCOL_GPU_VIPER=y -BUILD_ISA=y -USE_X86_ISA=y -GCN3_GPU_ISA=y -BUILD_GPU=y diff --git a/build_tools/sim_object_param_struct_hh.py b/build_tools/sim_object_param_struct_hh.py index c82c25921c..23e10a9bfa 100644 --- a/build_tools/sim_object_param_struct_hh.py +++ b/build_tools/sim_object_param_struct_hh.py @@ -211,8 +211,7 @@ code.indent() if sim_object == SimObject: code( """ -SimObjectParams() {} -virtual ~SimObjectParams() {} +virtual ~SimObjectParams() = default; std::string name; """ diff --git a/configs/deprecated/example/se.py b/configs/deprecated/example/se.py index afdb82489d..6ad4b02b32 100644 --- a/configs/deprecated/example/se.py +++ b/configs/deprecated/example/se.py @@ -224,7 +224,7 @@ for cpu in system.cpu: if ObjectList.is_kvm_cpu(CPUClass) or ObjectList.is_kvm_cpu(FutureClass): if buildEnv["USE_X86_ISA"]: system.kvm_vm = KvmVM() - system.m5ops_base = 0xFFFF0000 + system.m5ops_base = max(0xFFFF0000, Addr(args.mem_size).getValue()) for process in multiprocesses: process.useArchPT = 
True process.kvmInSE = True diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index f9daf8a88b..eb7c625cad 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -335,6 +335,12 @@ parser.add_argument( default="dynamic", help="register allocation policy (simple/dynamic)", ) +parser.add_argument( + "--register-file-cache-size", + type=int, + default=0, + help="number of registers in cache", +) parser.add_argument( "--dgpu", @@ -369,11 +375,33 @@ parser.add_argument( parser.add_argument( "--gfx-version", type=str, - default="gfx801", + default="gfx902", choices=GfxVersion.vals, help="Gfx version for gpuNote: gfx902 is not fully supported by ROCm", ) +parser.add_argument( + "--tcp-rp", + type=str, + default="TreePLRURP", + help="cache replacement policy" "policy for tcp", +) + +parser.add_argument( + "--tcc-rp", + type=str, + default="TreePLRURP", + help="cache replacement policy" "policy for tcc", +) + +# sqc rp both changes sqc rp and scalar cache rp +parser.add_argument( + "--sqc-rp", + type=str, + default="TreePLRURP", + help="cache replacement policy" "policy for sqc", +) + Ruby.define_options(parser) # add TLB options to the parser @@ -428,6 +456,7 @@ print( # shader is the GPU shader = Shader( n_wf=args.wfs_per_simd, + cu_per_sqc=args.cu_per_sqc, clk_domain=SrcClockDomain( clock=args.gpu_clock, voltage_domain=VoltageDomain(voltage=args.gpu_voltage), @@ -493,6 +522,7 @@ for i in range(n_cu): vrfs = [] vrf_pool_mgrs = [] srfs = [] + rfcs = [] srf_pool_mgrs = [] for j in range(args.simds_per_cu): for k in range(shader.n_wf): @@ -537,10 +567,16 @@ for i in range(n_cu): simd_id=j, wf_size=args.wf_size, num_regs=args.sreg_file_size ) ) + rfcs.append( + RegisterFileCache( + simd_id=j, cache_size=args.register_file_cache_size + ) + ) compute_units[-1].wavefronts = wavefronts compute_units[-1].vector_register_file = vrfs compute_units[-1].scalar_register_file = srfs + compute_units[-1].register_file_cache = rfcs 
compute_units[-1].register_manager = RegisterManager( policy=args.registerManagerPolicy, vrf_pool_managers=vrf_pool_mgrs, @@ -671,7 +707,7 @@ render_driver = GPURenderDriver(filename=f"dri/renderD{renderDriNum}") gpu_hsapp = HSAPacketProcessor( pioAddr=hsapp_gpu_map_paddr, numHWQueues=args.num_hw_queues ) -dispatcher = GPUDispatcher() +dispatcher = GPUDispatcher(kernel_exit_events=True) gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp, dispatcher=dispatcher) gpu_driver.device = gpu_cmd_proc shader.dispatcher = dispatcher @@ -798,6 +834,8 @@ if fast_forward: # configure the TLB hierarchy GPUTLBConfig.config_tlb_hierarchy(args, system, shader_idx) +system.exit_on_work_items = True + # create Ruby system system.piobus = IOXBar( width=32, response_latency=0, frontend_latency=0, forward_latency=0 @@ -938,19 +976,15 @@ root = Root(system=system, full_system=False) # knows what type of GPU hardware we are simulating if args.dgpu: assert args.gfx_version in [ - "gfx803", "gfx900", ], "Incorrect gfx version for dGPU" - if args.gfx_version == "gfx803": - hsaTopology.createFijiTopology(args) - elif args.gfx_version == "gfx900": + if args.gfx_version == "gfx900": hsaTopology.createVegaTopology(args) else: assert args.gfx_version in [ - "gfx801", "gfx902", ], "Incorrect gfx version for APU" - hsaTopology.createCarrizoTopology(args) + hsaTopology.createRavenTopology(args) m5.ticks.setGlobalFrequency("1THz") if args.abs_max_tick: @@ -976,6 +1010,41 @@ if args.fast_forward: exit_event = m5.simulate(maxtick) +while True: + if ( + exit_event.getCause() == "m5_exit instruction encountered" + or exit_event.getCause() == "user interrupt received" + or exit_event.getCause() == "simulate() limit reached" + or "exiting with last active thread context" in exit_event.getCause() + ): + print(f"breaking loop due to: {exit_event.getCause()}.") + break + elif "checkpoint" in exit_event.getCause(): + assert args.checkpoint_dir is not None + m5.checkpoint(args.checkpoint_dir) + print("breaking 
loop with checkpoint") + break + elif "GPU Kernel Completed" in exit_event.getCause(): + print("GPU Kernel Completed dump and reset") + m5.stats.dump() + m5.stats.reset() + elif "GPU Blit Kernel Completed" in exit_event.getCause(): + print("GPU Blit Kernel Completed dump and reset") + m5.stats.dump() + m5.stats.reset() + elif "workbegin" in exit_event.getCause(): + print("m5 work begin dump and reset") + m5.stats.dump() + m5.stats.reset() + elif "workend" in exit_event.getCause(): + print("m5 work end dump and reset") + m5.stats.dump() + m5.stats.reset() + else: + print(f"Unknown exit event: {exit_event.getCause()}. Continuing...") + + exit_event = m5.simulate(maxtick - m5.curTick()) + if args.fast_forward: if exit_event.getCause() == "a thread reached the max instruction count": m5.switchCpus(system, switch_cpu_list) diff --git a/configs/example/arm/starter_se.py b/configs/example/arm/starter_se.py index 33cf7b2f40..6d5b06b9ae 100644 --- a/configs/example/arm/starter_se.py +++ b/configs/example/arm/starter_se.py @@ -53,15 +53,24 @@ from common import ( MemConfig, ObjectList, ) -from common.cores.arm import HPI +from common.cores.arm import ( + HPI, + O3_ARM_v7a, +) # Pre-defined CPU configurations. Each tuple must be ordered as : (cpu_class, -# l1_icache_class, l1_dcache_class, walk_cache_class, l2_Cache_class). Any of +# l1_icache_class, l1_dcache_class, l2_Cache_class). Any of # the cache class may be 'None' if the particular cache is not present. 
cpu_types = { "atomic": (AtomicSimpleCPU, None, None, None), "minor": (MinorCPU, devices.L1I, devices.L1D, devices.L2), "hpi": (HPI.HPI, HPI.HPI_ICache, HPI.HPI_DCache, HPI.HPI_L2), + "o3": ( + O3_ARM_v7a.O3_ARM_v7a_3, + O3_ARM_v7a.O3_ARM_v7a_ICache, + O3_ARM_v7a.O3_ARM_v7a_DCache, + O3_ARM_v7a.O3_ARM_v7aL2, + ), } diff --git a/configs/example/cache_partitioning.py b/configs/example/cache_partitioning.py new file mode 100644 index 0000000000..9a363756e8 --- /dev/null +++ b/configs/example/cache_partitioning.py @@ -0,0 +1,201 @@ +# Copyright (c) 2024 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# This script showcases the functionality of cache partitioning policies, +# containing a simple system comprised of a memory requestor (TrafficGen), +# a cache enforcing policies for requests and a SimpleMemory backing store.
+# +# Using the Way policy, the cache should show the following statistics in the +# provided configuration: +# +# | Allocated Ways | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | +# |----------------|---|-----|-----|-----|-----|-----|-----|------| +# | Cache Hits | 0 | 256 | 384 | 512 | 640 | 768 | 896 | 1024 | +# +# Using the MaxCapacity policy, expected results are the following: +# +# | Allocation % | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 | +# |--------------|----|-----|-----|-----|-----|-----|-----|-----|-----|------| +# | Cache Hits | 0 | 152 | 307 | 409 | 512 | 614 | 716 | 819 | 921 | 1024 | + +import argparse + +import m5 +from m5.objects import * + + +def capacityAllocation(capacity_str): + """ + Verify that Max Capacity partitioning policy has been provided with a suitable + configuration + """ + capacity = float(capacity_str) + + if capacity > 1 or capacity < 0: + raise argparse.ArgumentTypeError( + "Max Capacity Policy needs allocation in range [0, 1]" + ) + + return capacity + + +def wayAllocation(way_str): + """ + Verify that Way partitioning policy has been provided with a suitable + configuration + """ + way_alloc = int(way_str) + + if way_alloc < 0: + raise argparse.ArgumentTypeError( + "Way Policy needs positive number of ways" + ) + + return way_alloc + + +def generatePartPolicy(args): + """ + Generate Partitioning Policy object based on provided arguments + """ + assert args.policy in [ + "way", + "max_capacity", + ], "Only support generating way and max_capacity policies" + + if args.policy == "way": + allocated_ways = [way for way in range(0, args.way_allocation)] + allocation = WayPolicyAllocation(partition_id=0, ways=allocated_ways) + + return WayPartitioningPolicy(allocations=[allocation]) + + if args.policy == "max_capacity": + return MaxCapacityPartitioningPolicy( + partition_ids=[0], capacities=[args.capacity_allocation] + ) + + +def configSystem(): + """ + Configure base system and memory + """ + + system = System(membus=IOXBar(width=128)) 
+ system.clk_domain = SrcClockDomain( + clock="10THz", + voltage_domain=VoltageDomain(), + ) + + # Memory configuration + system.mem_ctrl = SimpleMemory(bandwidth="1GiB/s", latency="10ns") + + # add memory + system.mem_ctrl.range = AddrRange("64KiB") + system.mem_ctrl.port = system.membus.mem_side_ports + return system + + +parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter +) + +parser.add_argument( + "--policy", + default="way", + choices=["way", "max_capacity"], + help="This option defines which Cache Partitioning Policy to use for " + "the system cache", +) + +parser.add_argument( + "--capacity-allocation", + type=capacityAllocation, + default=0.5, + help="The amount of the cache to partition to the default PartitionID " + "when using Max Capacity Cache Partitioning Policy in [0,1] range", +) + +parser.add_argument( + "--way-allocation", + type=wayAllocation, + default=4, + help="The number of ways in the cache to partition to the default " + "PartitionID when using Way Cache Partitioning Policy", +) + +args = parser.parse_args() + +m5.ticks.setGlobalFrequency("10THz") +system = configSystem() + +# create a cache to sit between the memory and traffic gen to enforce +# partitioning policies +part_manager = PartitionManager( + partitioning_policies=[generatePartPolicy(args)] +) +system.cache = NoncoherentCache( + size="64KiB", + assoc=8, + partitioning_manager=part_manager, + tag_latency=0, + data_latency=0, + response_latency=0, + mshrs=1, + tgts_per_mshr=8, + write_buffers=1, + replacement_policy=MRURP(), +) +system.cache.mem_side = system.membus.cpu_side_ports + +# instantiate traffic gen and connect to crossbar +system.tgen = PyTrafficGen() +system.tgen.port = system.cache.cpu_side + +# finalise config and run simulation +root = Root(full_system=False, system=system) +root.system.mem_mode = "timing" +m5.instantiate() + +# configure traffic generator to do 2x 64KiB sequential reads from address 0 +# to 65536; one to warm 
up the cache one to test cache partitioning +linear_tgen = system.tgen.createLinear( + 1000000000, 0, 65536, 64, 1, 1, 100, 65536 +) +exit_tgen = system.tgen.createExit(1) +system.tgen.start([linear_tgen, linear_tgen, exit_tgen]) + +# handle exit reporting +exit_event = m5.simulate(2000000000) +print(f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}") diff --git a/configs/example/gem5_library/arm-hello.py b/configs/example/gem5_library/arm-hello.py index 39583463e7..87d1f75d8c 100644 --- a/configs/example/gem5_library/arm-hello.py +++ b/configs/example/gem5_library/arm-hello.py @@ -84,7 +84,7 @@ board.set_se_binary_workload( # Any resource specified in this file will be automatically retrieved. # At the time of writing, this file is a WIP and does not contain all # resources. Jira ticket: https://gem5.atlassian.net/browse/GEM5-1096 - obtain_resource("arm-hello64-static") + obtain_resource("arm-hello64-static", resource_version="1.0.0") ) # Lastly we run the simulation. diff --git a/configs/example/gem5_library/arm-ubuntu-run-with-kvm.py b/configs/example/gem5_library/arm-ubuntu-run-with-kvm.py new file mode 100644 index 0000000000..62da70c023 --- /dev/null +++ b/configs/example/gem5_library/arm-ubuntu-run-with-kvm.py @@ -0,0 +1,143 @@ +# Copyright (c) 2022-23 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +This script further shows an example of booting an ARM based full system Ubuntu +disk image. This simulation boots the disk image using 2 TIMING CPU cores. The +simulation ends when the startup is completed successfully (i.e. when an +`m5_exit instruction is reached on successful boot). 
+ +Usage +----- + +``` +scons build/ARM/gem5.opt -j +./build/ARM/gem5.opt configs/example/gem5_library/arm-ubuntu-run-with-kvm.py +``` + +""" + +from m5.objects import ( + ArmDefaultRelease, + VExpress_GEM5_V1, +) + +from gem5.coherence_protocol import CoherenceProtocol +from gem5.components.boards.arm_board import ArmBoard +from gem5.components.memory import DualChannelDDR4_2400 +from gem5.components.processors.cpu_types import CPUTypes +from gem5.components.processors.simple_switchable_processor import ( + SimpleSwitchableProcessor, +) +from gem5.isas import ISA +from gem5.resources.resource import obtain_resource +from gem5.simulate.exit_event import ExitEvent +from gem5.simulate.simulator import Simulator +from gem5.utils.requires import requires + +# This runs a check to ensure the gem5 binary is compiled for ARM. +requires(isa_required=ISA.ARM) + +from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import ( + PrivateL1PrivateL2CacheHierarchy, +) + +# Here we setup the parameters of the l1 and l2 caches. +cache_hierarchy = PrivateL1PrivateL2CacheHierarchy( + l1d_size="16kB", l1i_size="16kB", l2_size="256kB" +) + +# Memory: Dual Channel DDR4 2400 DRAM device. +memory = DualChannelDDR4_2400(size="2GB") + +# Here we setup the processor. This is a special switchable processor in which +# a starting core type and a switch core type must be specified. Once a +# configuration is instantiated a user may call `processor.switch()` to switch +# from the starting core types to the switch core types. In this simulation +# we start with KVM cores to simulate the OS boot, then switch to the Timing +# cores for the command we wish to run after boot. +processor = SimpleSwitchableProcessor( + starting_core_type=CPUTypes.KVM, + switch_core_type=CPUTypes.TIMING, + isa=ISA.ARM, + num_cores=2, +) + +# The ArmBoard requires a `release` to be specified. This adds all the +# extensions or features to the system. 
We are setting this to for_kvm() +# to enable KVM simulation. +release = ArmDefaultRelease.for_kvm() + +# The platform sets up the memory ranges of all the on-chip and off-chip +# devices present on the ARM system. ARM KVM only works with VExpress_GEM5_V1 +# on the ArmBoard at the moment. +platform = VExpress_GEM5_V1() + +# Here we setup the board. The ArmBoard allows for Full-System ARM simulations. +board = ArmBoard( + clk_freq="3GHz", + processor=processor, + memory=memory, + cache_hierarchy=cache_hierarchy, + release=release, + platform=platform, +) +# This is the command to run after the system has booted. The first `m5 exit` +# will stop the simulation so we can switch the CPU cores from KVM to timing +# and continue the simulation to run the echo command, sleep for a second, +# then, again, call `m5 exit` to terminate the simulation. After simulation +# has ended you may inspect `m5out/system.pc.com_1.device` to see the echo +# output. +command = ( + "m5 --addr=0x10010000 exit;" + + "echo 'This is running on Timing CPU cores.';" + + "m5 exit;" +) + +# Here we set a full system workload. The "arm64-ubuntu-20.04-boot" boots +# Ubuntu 20.04. We use arm64-bootloader (boot.arm64) as the bootloader to use +# ARM KVM. +board.set_kernel_disk_workload( + kernel=obtain_resource( + "arm64-linux-kernel-5.4.49", resource_version="1.0.0" + ), + disk_image=obtain_resource( + "arm64-ubuntu-20.04-img", resource_version="1.0.0" + ), + bootloader=obtain_resource("arm64-bootloader", resource_version="1.0.0"), + readfile_contents=command, +) +# We define the system with the aforementioned system defined. +simulator = Simulator( + board=board, + on_exit_event={ExitEvent.EXIT: (func() for func in [processor.switch])}, +) + +# Once the system successfully boots, it encounters an +# `m5_exit instruction encountered`. We stop the simulation then. When the +# simulation has ended you may inspect `m5out/board.terminal` to see +# the stdout. 
+simulator.run() diff --git a/configs/example/gem5_library/arm-ubuntu-run.py b/configs/example/gem5_library/arm-ubuntu-run.py index 734fb9ee1b..4c784d6f9d 100644 --- a/configs/example/gem5_library/arm-ubuntu-run.py +++ b/configs/example/gem5_library/arm-ubuntu-run.py @@ -102,7 +102,9 @@ board = ArmBoard( # Here we set a full system workload. The "arm64-ubuntu-20.04-boot" boots # Ubuntu 20.04. -board.set_workload(obtain_resource("arm64-ubuntu-20.04-boot")) +board.set_workload( + obtain_resource("arm64-ubuntu-20.04-boot", resource_version="2.0.0") +) # We define the system with the aforementioned system defined. diff --git a/configs/example/gem5_library/caches/octopi-cache-example.py b/configs/example/gem5_library/caches/octopi-cache-example.py index fa19773167..80a0c71865 100644 --- a/configs/example/gem5_library/caches/octopi-cache-example.py +++ b/configs/example/gem5_library/caches/octopi-cache-example.py @@ -97,7 +97,9 @@ board = ArmBoard( platform=platform, ) -board.set_workload(obtain_resource("arm64-ubuntu-20.04-boot")) +board.set_workload( + obtain_resource("arm64-ubuntu-20.04-boot", resource_version="2.0.0") +) simulator = Simulator(board=board) simulator.run() diff --git a/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py b/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py index aa78de5647..273ee92ae6 100644 --- a/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py +++ b/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py @@ -90,7 +90,9 @@ board = SimpleBoard( board.set_se_binary_workload( # the workload should be the same as the save-checkpoint script obtain_resource("riscv-hello"), - checkpoint=obtain_resource("riscv-hello-example-checkpoint"), + checkpoint=obtain_resource( + "riscv-hello-example-checkpoint", resource_version="3.0.0" + ), ) simulator = Simulator( diff --git a/configs/example/gem5_library/checkpoints/riscv-hello-save-checkpoint.py 
b/configs/example/gem5_library/checkpoints/riscv-hello-save-checkpoint.py index b024a3a44a..c231203d56 100644 --- a/configs/example/gem5_library/checkpoints/riscv-hello-save-checkpoint.py +++ b/configs/example/gem5_library/checkpoints/riscv-hello-save-checkpoint.py @@ -107,8 +107,8 @@ board.set_se_binary_workload( # Lastly we run the simulation. max_ticks = 10**6 -simulator = Simulator(board=board, full_system=False) -simulator.run(max_ticks=max_ticks) +simulator = Simulator(board=board, full_system=False, max_ticks=max_ticks) +simulator.run() print( "Exiting @ tick {} because {}.".format( diff --git a/configs/example/gem5_library/checkpoints/simpoints-se-restore.py b/configs/example/gem5_library/checkpoints/simpoints-se-restore.py index 284289be6f..a396869df3 100644 --- a/configs/example/gem5_library/checkpoints/simpoints-se-restore.py +++ b/configs/example/gem5_library/checkpoints/simpoints-se-restore.py @@ -60,8 +60,8 @@ from m5.stats import ( ) from gem5.components.boards.simple_board import SimpleBoard -from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import ( - PrivateL1PrivateL2CacheHierarchy, +from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import ( + PrivateL1PrivateL2WalkCacheHierarchy, ) from gem5.components.memory import DualChannelDDR4_2400 from gem5.components.processors.cpu_types import CPUTypes @@ -80,7 +80,7 @@ requires(isa_required=ISA.X86) # The cache hierarchy can be different from the cache hierarchy used in taking # the checkpoints -cache_hierarchy = PrivateL1PrivateL2CacheHierarchy( +cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy( l1d_size="32kB", l1i_size="32kB", l2_size="256kB", @@ -125,7 +125,9 @@ board.set_se_simpoint_workload( weight_list=[0.1, 0.2, 0.4, 0.3], warmup_interval=1000000, ), - checkpoint=obtain_resource("simpoints-se-checkpoints-v23-0-v1"), + checkpoint=obtain_resource( + "simpoints-se-checkpoints", resource_version="3.0.0" + ), ) diff --git 
a/configs/example/gem5_library/dramsys/arm-hello-dramsys.py b/configs/example/gem5_library/dramsys/arm-hello-dramsys.py index 2561f98fae..3a88d4ce9a 100644 --- a/configs/example/gem5_library/dramsys/arm-hello-dramsys.py +++ b/configs/example/gem5_library/dramsys/arm-hello-dramsys.py @@ -78,7 +78,7 @@ board.set_se_binary_workload( # Any resource specified in this file will be automatically retrieved. # At the time of writing, this file is a WIP and does not contain all # resources. Jira ticket: https://gem5.atlassian.net/browse/GEM5-1096 - obtain_resource("arm-hello64-static") + obtain_resource("arm-hello64-static", resource_version="1.0.0") ) # Lastly we run the simulation. diff --git a/configs/example/gem5_library/looppoints/restore-looppoint-checkpoint.py b/configs/example/gem5_library/looppoints/restore-looppoint-checkpoint.py index 781b2f7281..a97ea39d17 100644 --- a/configs/example/gem5_library/looppoints/restore-looppoint-checkpoint.py +++ b/configs/example/gem5_library/looppoints/restore-looppoint-checkpoint.py @@ -48,8 +48,8 @@ from m5.stats import ( ) from gem5.components.boards.simple_board import SimpleBoard -from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import ( - PrivateL1PrivateL2CacheHierarchy, +from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import ( + PrivateL1PrivateL2WalkCacheHierarchy, ) from gem5.components.memory import DualChannelDDR4_2400 from gem5.components.processors.cpu_types import CPUTypes @@ -90,7 +90,7 @@ args = parser.parse_args() # The cache hierarchy can be different from the cache hierarchy used in taking # the checkpoints -cache_hierarchy = PrivateL1PrivateL2CacheHierarchy( +cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy( l1d_size="32kB", l1i_size="32kB", l2_size="256kB", diff --git a/configs/example/gem5_library/multisim/multisim-fs-x86-npb.py b/configs/example/gem5_library/multisim/multisim-fs-x86-npb.py new file mode 100644 index 
0000000000..eff2b0c48f --- /dev/null +++ b/configs/example/gem5_library/multisim/multisim-fs-x86-npb.py @@ -0,0 +1,138 @@ +# Copyright (c) 2024 The Regents of the University of California. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""An example of a single configuration script for defining multiple +simulations through the gem5 `multisim` module. + +This script creates 6 full system simulations by iterating through a suite +of benchmarks and different core counts. + +Usage +----- + +1.
To run all the simulations defined in this script:: + +```shell + -m gem5.utils.multisim \ + configs/example/gem5_library/multisim/multisim-fs-x86-npb.py +``` + +2. To run a specific simulation defined in this script: + +```shell + configs/example/gem5_library/multisim/multisim-fs-x86-npb.py \ + # e.g. npb-bt-a_cores-1 +``` + +3. To list all the IDs of the simulations defined in this script: + +```shell + configs/example/gem5_library/multisim/multisim-fs-x86-npb.py -l +``` +""" + +import m5 + +import gem5.utils.multisim as multisim +from gem5.coherence_protocol import CoherenceProtocol +from gem5.components.boards.x86_board import X86Board +from gem5.components.memory import DualChannelDDR4_2400 +from gem5.components.processors.cpu_types import CPUTypes +from gem5.components.processors.simple_switchable_processor import ( + SimpleSwitchableProcessor, +) +from gem5.isas import ISA +from gem5.resources.resource import obtain_resource +from gem5.simulate.simulator import ( + ExitEvent, + Simulator, +) +from gem5.utils.requires import requires + +requires( + isa_required=ISA.X86, + coherence_protocol_required=CoherenceProtocol.MESI_TWO_LEVEL, +) + +from gem5.components.cachehierarchies.ruby.mesi_two_level_cache_hierarchy import ( + MESITwoLevelCacheHierarchy, +) + + +def handle_workbegin(): + m5.stats.reset() + processor.switch() + yield False + + +def handle_workend(): + m5.stats.dump() + yield True + + +# Set the maximum number of concurrent processes to be 3. +multisim.set_num_processes(3) + +# Here we imagine an experiment wanting to run each NPB benchmark on the same +# system twice: once with 1 core and once with 2 cores. 
+for benchmark in obtain_resource("npb-benchmark-suite"): + for num_cores in [1, 2]: + cache_hierarchy = MESITwoLevelCacheHierarchy( + l1d_size="32kB", + l1i_size="32kB", + l2_size="256kB", + l1d_assoc=8, + l1i_assoc=8, + l2_assoc=16, + num_l2_banks=2, + ) + memory = DualChannelDDR4_2400(size="3GB") + processor = SimpleSwitchableProcessor( + starting_core_type=CPUTypes.ATOMIC, + switch_core_type=CPUTypes.TIMING, + isa=ISA.X86, + num_cores=num_cores, + ) + board = X86Board( + clk_freq="3GHz", + processor=processor, + memory=memory, + cache_hierarchy=cache_hierarchy, + ) + + board.set_workload(benchmark) + + simulator = Simulator( + board=board, + on_exit_event={ + ExitEvent.WORKBEGIN: handle_workbegin(), + ExitEvent.WORKEND: handle_workend(), + }, + ) + + simulator.set_id(f"{benchmark.get_id()}_cores-{num_cores}") + + multisim.add_simulator(simulator) diff --git a/configs/example/gem5_library/multisim/multisim-print-this.py b/configs/example/gem5_library/multisim/multisim-print-this.py new file mode 100644 index 0000000000..bd724a5d92 --- /dev/null +++ b/configs/example/gem5_library/multisim/multisim-print-this.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024 The Regents of the University of California +# All Rights Reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""An example of a single configuration script for defining multiple +simulations through the gem5 `multisim` module. + +This script is very simple and simply prints a simple message once for each +simulation, outputting the process id. + +Usage +----- + +1. To run all the simulations defined in this script:: + +```shell + -m gem5.utils.multisim \ + configs/example/gem5_library/multisim/multisim-print-this.py +``` + +2. To run a specific simulation defined in this script: + +```shell + configs/example/gem5_library/multisim/multisim-print-this.py \ + process_id_1 +``` + +3.
To list all the IDs of the simulations defined in this script: + +```shell + configs/example/gem5_library/multisim/multisim-print-this.py -l +``` +""" + + +import gem5.utils.multisim as multisim +from gem5.components.boards.simple_board import SimpleBoard +from gem5.components.cachehierarchies.classic.no_cache import NoCache +from gem5.components.memory import SingleChannelDDR3_1600 +from gem5.components.processors.cpu_types import CPUTypes +from gem5.components.processors.simple_processor import SimpleProcessor +from gem5.isas import ISA +from gem5.resources.resource import obtain_resource +from gem5.simulate.simulator import Simulator + +# Set the maximum number of concurrent processes to be 2. +multisim.set_num_processes(2) + +for process_id in range(5): + cache_hierarchy = NoCache() + memory = SingleChannelDDR3_1600(size="32MB") + processor = SimpleProcessor( + cpu_type=CPUTypes.TIMING, isa=ISA.X86, num_cores=1 + ) + board = SimpleBoard( + clk_freq="1GHz", + processor=processor, + memory=memory, + cache_hierarchy=cache_hierarchy, + ) + board.set_se_binary_workload( + binary=obtain_resource("x86-print-this"), + arguments=[f"Hello from process {process_id}", 1], + ) + multisim.add_simulator(Simulator(board=board, id=f"process_{process_id}")) diff --git a/configs/example/gem5_library/power-hello.py b/configs/example/gem5_library/power-hello.py index 8a73b6a201..69106baace 100644 --- a/configs/example/gem5_library/power-hello.py +++ b/configs/example/gem5_library/power-hello.py @@ -75,7 +75,9 @@ board = SimpleBoard( cache_hierarchy=cache_hierarchy, ) -board.set_se_binary_workload(obtain_resource("power-hello")) +board.set_se_binary_workload( + obtain_resource("power-hello", resource_version="1.0.0") +) # Lastly we run the simulation. 
simulator = Simulator(board=board) diff --git a/configs/example/gem5_library/riscv-fs.py b/configs/example/gem5_library/riscv-fs.py index 914d9a7023..5f37c259ed 100644 --- a/configs/example/gem5_library/riscv-fs.py +++ b/configs/example/gem5_library/riscv-fs.py @@ -40,8 +40,8 @@ Characteristics """ from gem5.components.boards.riscv_board import RiscvBoard -from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import ( - PrivateL1PrivateL2CacheHierarchy, +from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import ( + PrivateL1PrivateL2WalkCacheHierarchy, ) from gem5.components.memory import SingleChannelDDR3_1600 from gem5.components.processors.cpu_types import CPUTypes @@ -57,7 +57,7 @@ requires(isa_required=ISA.RISCV) # Setup the cache hierarchy. # For classic, PrivateL1PrivateL2 and NoCache have been tested. # For Ruby, MESI_Two_Level and MI_example have been tested. -cache_hierarchy = PrivateL1PrivateL2CacheHierarchy( +cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy( l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB" ) @@ -79,8 +79,10 @@ board = RiscvBoard( # Set the Full System workload. board.set_kernel_disk_workload( - kernel=obtain_resource("riscv-bootloader-vmlinux-5.10"), - disk_image=obtain_resource("riscv-disk-img"), + kernel=obtain_resource( + "riscv-bootloader-vmlinux-5.10", resource_version="1.0.0" + ), + disk_image=obtain_resource("riscv-disk-img", resource_version="1.0.0"), ) simulator = Simulator(board=board) diff --git a/configs/example/gem5_library/riscv-ubuntu-run.py b/configs/example/gem5_library/riscv-ubuntu-run.py index 1d31b055de..c236b69169 100644 --- a/configs/example/gem5_library/riscv-ubuntu-run.py +++ b/configs/example/gem5_library/riscv-ubuntu-run.py @@ -57,12 +57,12 @@ from gem5.utils.requires import requires requires(isa_required=ISA.RISCV) # With RISCV, we use simple caches. 
-from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import ( - PrivateL1PrivateL2CacheHierarchy, +from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import ( + PrivateL1PrivateL2WalkCacheHierarchy, ) # Here we setup the parameters of the l1 and l2 caches. -cache_hierarchy = PrivateL1PrivateL2CacheHierarchy( +cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy( l1d_size="16kB", l1i_size="16kB", l2_size="256kB" ) @@ -88,7 +88,9 @@ board = RiscvBoard( # Ubuntu 20.04. Once the system successfully boots it encounters an `m5_exit` # instruction which stops the simulation. When the simulation has ended you may # inspect `m5out/system.pc.com_1.device` to see the stdout. -board.set_workload(obtain_resource("riscv-ubuntu-20.04-boot")) +board.set_workload( + obtain_resource("riscv-ubuntu-20.04-boot", resource_version="3.0.0") +) simulator = Simulator(board=board) simulator.run() diff --git a/configs/example/gem5_library/riscvmatched-fs.py b/configs/example/gem5_library/riscvmatched-fs.py index ad045cac3d..34faed0b2c 100644 --- a/configs/example/gem5_library/riscvmatched-fs.py +++ b/configs/example/gem5_library/riscvmatched-fs.py @@ -76,7 +76,7 @@ board = RISCVMatchedBoard( # In the case where the `-i` flag is passed, we add the kernel argument # `init=/root/exit.sh`. This means the simulation will exit after the Linux # Kernel has booted. 
-workload = obtain_resource("riscv-ubuntu-20.04-boot") +workload = obtain_resource("riscv-ubuntu-20.04-boot", resource_version="3.0.0") kernel_args = board.get_default_kernel_args() if args.to_init: kernel_args.append("init=/root/exit.sh") diff --git a/configs/example/gem5_library/riscvmatched-hello.py b/configs/example/gem5_library/riscvmatched-hello.py index 3ea13b4851..f95bb051e9 100644 --- a/configs/example/gem5_library/riscvmatched-hello.py +++ b/configs/example/gem5_library/riscvmatched-hello.py @@ -49,7 +49,9 @@ requires(isa_required=ISA.RISCV) board = RISCVMatchedBoard() # set the hello world riscv binary as the board workload -board.set_se_binary_workload(obtain_resource("riscv-hello")) +board.set_se_binary_workload( + obtain_resource("riscv-hello", resource_version="1.0.0") +) # run the simulation with the RISCV Matched board simulator = Simulator(board=board, full_system=False) diff --git a/configs/example/gem5_library/riscvmatched-microbenchmark-suite.py b/configs/example/gem5_library/riscvmatched-microbenchmark-suite.py index 2024bdddf0..01a274b39d 100644 --- a/configs/example/gem5_library/riscvmatched-microbenchmark-suite.py +++ b/configs/example/gem5_library/riscvmatched-microbenchmark-suite.py @@ -45,7 +45,9 @@ requires(isa_required=ISA.RISCV) board = RISCVMatchedBoard() # obtain the RISC-V Vertical Microbenchmarks -microbenchmarks = obtain_resource("riscv-vertical-microbenchmarks") +microbenchmarks = obtain_resource( + "riscv-vertical-microbenchmarks", resource_version="1.0.0" +) # list all the microbenchmarks present in the suite print("Microbenchmarks present in the suite:") diff --git a/configs/example/gem5_library/spatter_gen/spatter-gen-test.py b/configs/example/gem5_library/spatter_gen/spatter-gen-test.py new file mode 100644 index 0000000000..ef0cc04aa5 --- /dev/null +++ b/configs/example/gem5_library/spatter_gen/spatter-gen-test.py @@ -0,0 +1,97 @@ +# Copyright (c) 2024 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +Script that runs a SpatterGen test with a specific trace file. +This script can be used as an example on how to use SpatterGenerator, +SpatterKernel, and its utilities to run a Spatter trace in gem5. + +The script uses a spatter trace taken from the hpcgarage github repository. 
+Link to the original trace file: + +https://github.com/hpcgarage/spatter/blob/main/standard-suite/app-traces/amg.json + +It will create a system with `num_cores` SpatterGenerators and interleave the +trace by `intlv_size` elements in the `pattern` field from the trace. +Interleaving is done for assigning part of the access to each core. + +Usage: +------ + +``` +scons build/NULL/gem5.opt +./build/NULL/gem5.opt configs/example/gem5_library/spatter_gen/spatter-gen-test.py +``` +""" +import argparse +import json +from pathlib import Path + +import m5 +from m5.objects import Root + +from gem5.components.boards.test_board import TestBoard +from gem5.components.cachehierarchies.classic.private_l1_cache_hierarchy import ( + PrivateL1CacheHierarchy, +) +from gem5.components.memory import DualChannelDDR4_2400 +from gem5.components.processors.spatter_gen import ( + SpatterGenerator, + prepare_kernels, +) +from gem5.simulate.simulator import Simulator + +num_cores = 8 +intlv_size = 128 + +memory = DualChannelDDR4_2400(size="8GiB") + +generator = SpatterGenerator( + processing_mode="synchronous", num_cores=num_cores +) + +kernels = prepare_kernels( + Path(__file__).parent / "traces/amg.json", + num_cores, + intlv_size, + 0, + memory.get_size() // 2, +) +for kernel in kernels: + generator.add_kernel(kernel) + +board = TestBoard( + clk_freq="4GHz", + generator=generator, + cache_hierarchy=PrivateL1CacheHierarchy( + l1d_size="32KiB", l1i_size="32KiB" + ), + memory=memory, +) + +simulator = Simulator(board=board, full_system=False) + +simulator.run() diff --git a/configs/example/gem5_library/spatter_gen/traces/amg.json b/configs/example/gem5_library/spatter_gen/traces/amg.json new file mode 100644 index 0000000000..64da33a2e1 --- /dev/null +++ b/configs/example/gem5_library/spatter_gen/traces/amg.json @@ -0,0 +1 @@ +[{"delta": 1, "kernel": "Gather", "pattern": [1333, 0, 1, 2, 36, 37, 38, 72, 73, 74, 1296, 1297, 1298, 1332, 1334, 1368], "count": 1454647}, {"delta": 1, "kernel": 
"Gather", "pattern": [1333, 0, 1, 36, 37, 72, 73, 1296, 1297, 1332, 1368, 1369, 2592, 2593, 2628, 2629], "count": 1454647}] diff --git a/configs/example/gem5_library/x86-gapbs-benchmarks.py b/configs/example/gem5_library/x86-gapbs-benchmarks.py index 4ef6f52b9c..9864deaae9 100644 --- a/configs/example/gem5_library/x86-gapbs-benchmarks.py +++ b/configs/example/gem5_library/x86-gapbs-benchmarks.py @@ -77,7 +77,9 @@ parser = argparse.ArgumentParser( description="An example configuration script to run the gapbs benchmarks." ) -gapbs_suite = obtain_resource("gapbs-benchmark-suite") +gapbs_suite = obtain_resource( + "gapbs-benchmark-suite", resource_version="1.0.0" +) # The only positional argument accepted is the benchmark name in this script. diff --git a/configs/example/gem5_library/x86-npb-benchmarks.py b/configs/example/gem5_library/x86-npb-benchmarks.py index 6e6d501c37..a578522a4a 100644 --- a/configs/example/gem5_library/x86-npb-benchmarks.py +++ b/configs/example/gem5_library/x86-npb-benchmarks.py @@ -88,7 +88,7 @@ parser = argparse.ArgumentParser( description="An example configuration script to run the npb benchmarks." ) -npb_suite = obtain_resource("npb-benchmark-suite") +npb_suite = obtain_resource("npb-benchmark-suite", resource_version="1.0.0") # The only positional argument accepted is the benchmark name in this script. parser.add_argument( diff --git a/configs/example/gem5_library/x86-parsec-benchmarks.py b/configs/example/gem5_library/x86-parsec-benchmarks.py index 71cfd4a9ef..36f56c4b95 100644 --- a/configs/example/gem5_library/x86-parsec-benchmarks.py +++ b/configs/example/gem5_library/x86-parsec-benchmarks.py @@ -185,10 +185,12 @@ board.set_kernel_disk_workload( # The x86 linux kernel will be automatically downloaded to the # `~/.cache/gem5` directory if not already present. 
# PARSEC benchamarks were tested with kernel version 4.19.83 - kernel=obtain_resource("x86-linux-kernel-4.19.83"), + kernel=obtain_resource( + "x86-linux-kernel-4.19.83", resource_version="1.0.0" + ), # The x86-parsec image will be automatically downloaded to the # `~/.cache/gem5` directory if not already present. - disk_image=obtain_resource("x86-parsec"), + disk_image=obtain_resource("x86-parsec", resource_version="1.0.0"), readfile_contents=command, ) diff --git a/configs/example/gem5_library/x86-ubuntu-run-with-kvm-no-perf.py b/configs/example/gem5_library/x86-ubuntu-run-with-kvm-no-perf.py index 632b409b16..d96ff80a3c 100644 --- a/configs/example/gem5_library/x86-ubuntu-run-with-kvm-no-perf.py +++ b/configs/example/gem5_library/x86-ubuntu-run-with-kvm-no-perf.py @@ -121,7 +121,7 @@ command = ( + "m5 exit;" ) -workload = obtain_resource("x86-ubuntu-18.04-boot") +workload = obtain_resource("x86-ubuntu-18.04-boot", resource_version="2.0.0") workload.set_parameter("readfile_contents", command) board.set_workload(workload) diff --git a/configs/example/gem5_library/x86-ubuntu-run-with-kvm.py b/configs/example/gem5_library/x86-ubuntu-run-with-kvm.py index ec361dcd6e..b9d035757c 100644 --- a/configs/example/gem5_library/x86-ubuntu-run-with-kvm.py +++ b/configs/example/gem5_library/x86-ubuntu-run-with-kvm.py @@ -117,7 +117,7 @@ command = ( + "m5 exit;" ) -workload = obtain_resource("x86-ubuntu-18.04-boot") +workload = obtain_resource("x86-ubuntu-18.04-boot", resource_version="2.0.0") workload.set_parameter("readfile_contents", command) board.set_workload(workload) diff --git a/configs/example/gem5_library/x86-ubuntu-run.py b/configs/example/gem5_library/x86-ubuntu-run.py index 3b7b754b90..a8737d7297 100644 --- a/configs/example/gem5_library/x86-ubuntu-run.py +++ b/configs/example/gem5_library/x86-ubuntu-run.py @@ -55,7 +55,9 @@ board = X86DemoBoard() # We then set the workload. Here we use the "x86-ubuntu-18.04-boot" workload. 
# This boots Ubuntu 18.04 with Linux 5.4.49. If the required resources are not # found locally, they will be downloaded. -board.set_workload(obtain_resource("x86-ubuntu-18.04-boot")) +board.set_workload( + obtain_resource("x86-ubuntu-18.04-boot", resource_version="2.0.0") +) simulator = Simulator(board=board) simulator.run() diff --git a/configs/example/gpufs/Disjoint_VIPER.py b/configs/example/gpufs/Disjoint_VIPER.py index 28f0768c2a..0fd258e0fd 100644 --- a/configs/example/gpufs/Disjoint_VIPER.py +++ b/configs/example/gpufs/Disjoint_VIPER.py @@ -58,6 +58,8 @@ class Disjoint_VIPER(RubySystem): self.network_cpu = DisjointSimple(self) self.network_gpu = DisjointSimple(self) + self.block_size_bytes = options.cacheline_size + # Construct CPU controllers cpu_dir_nodes = construct_dirs(options, system, self, self.network_cpu) (cp_sequencers, cp_cntrl_nodes) = construct_corepairs( diff --git a/configs/example/gpufs/amd/AmdGPUOptions.py b/configs/example/gpufs/amd/AmdGPUOptions.py index 3d6a8cc48e..9996d33a2e 100644 --- a/configs/example/gpufs/amd/AmdGPUOptions.py +++ b/configs/example/gpufs/amd/AmdGPUOptions.py @@ -247,3 +247,9 @@ def addAmdGPUOptions(parser): default="simple", help="register allocation policy (simple/dynamic)", ) + parser.add_argument( + "--register-file-cache-size", + type=int, + default=0, + help="number of registers in cache", + ) diff --git a/configs/example/gpufs/mi200.py b/configs/example/gpufs/mi200.py new file mode 100644 index 0000000000..cc4f5df787 --- /dev/null +++ b/configs/example/gpufs/mi200.py @@ -0,0 +1,159 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import argparse +import base64 +import os +import sys +import tempfile + +import runfs +from amd import AmdGPUOptions +from common import ( + GPUTLBOptions, + Options, +) +from ruby import Ruby + +import m5 + +demo_runscript_without_checkpoint = """\ +export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH +export HSA_ENABLE_INTERRUPT=0 +export HCC_AMDGPU_TARGET=gfx90a +free -m +dmesg -n8 +dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128 +if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then + echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." 
+ /sbin/m5 exit +fi +modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0 +echo "Running {} {}" +echo "{}" | base64 -d > myapp +chmod +x myapp +./myapp {} +/sbin/m5 exit +""" + +demo_runscript_with_checkpoint = """\ +export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH +export HSA_ENABLE_INTERRUPT=0 +export HCC_AMDGPU_TARGET=gfx90a +dmesg -n8 +dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128 +if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then + echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." + /sbin/m5 exit +fi +modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0 +echo "Running {} {}" +echo "{}" | base64 -d > myapp +chmod +x myapp +/sbin/m5 checkpoint +./myapp {} +/sbin/m5 exit +""" + + +def addDemoOptions(parser): + parser.add_argument( + "-a", "--app", default=None, help="GPU application to run" + ) + parser.add_argument( + "-o", "--opts", default="", help="GPU application arguments" + ) + + +def runMI200GPUFS(cpu_type): + parser = argparse.ArgumentParser() + runfs.addRunFSOptions(parser) + Options.addCommonOptions(parser) + AmdGPUOptions.addAmdGPUOptions(parser) + Ruby.define_options(parser) + GPUTLBOptions.tlb_options(parser) + addDemoOptions(parser) + + # Parse now so we can override options + args = parser.parse_args() + demo_runscript = "" + + # Create temp script to run application + if args.app is None: + print(f"No application given. Use {sys.argv[0]} -a ") + sys.exit(1) + elif args.kernel is None: + print(f"No kernel path given. Use {sys.argv[0]} --kernel ") + sys.exit(1) + elif args.disk_image is None: + print(f"No disk path given. 
Use {sys.argv[0]} --disk-image ") + sys.exit(1) + elif not os.path.isfile(args.app): + print("Could not find application", args.app) + sys.exit(1) + + # Choose runscript based on whether any checkpointing args are set + if args.checkpoint_dir is not None: + demo_runscript = demo_runscript_with_checkpoint + else: + demo_runscript = demo_runscript_without_checkpoint + + with open(os.path.abspath(args.app), "rb") as binfile: + encodedBin = base64.b64encode(binfile.read()).decode() + + _, tempRunscript = tempfile.mkstemp() + with open(tempRunscript, "w") as b64file: + runscriptStr = demo_runscript.format( + args.app, args.opts, encodedBin, args.opts + ) + b64file.write(runscriptStr) + + if args.second_disk == None: + args.second_disk = args.disk_image + + # Defaults for MI200 + args.ruby = True + args.cpu_type = "X86KvmCPU" + args.mem_size = "8GB" # CPU host memory + args.dgpu = True + args.dgpu_mem_size = "16GB" # GPU device memory + args.dgpu_start = "0GB" + args.checkpoint_restore = 0 + args.disjoint = True + args.timing_gpu = True + args.script = tempRunscript + args.dgpu_xor_low_bit = 0 + args.gpu_device = "MI200" + + # Run gem5 + runfs.runGpuFSSystem(args) + + +if __name__ == "__m5_main__": + runMI200GPUFS("X86KvmCPU") diff --git a/configs/example/gpufs/mi300.py b/configs/example/gpufs/mi300.py new file mode 100644 index 0000000000..9e0e0da622 --- /dev/null +++ b/configs/example/gpufs/mi300.py @@ -0,0 +1,172 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +""" This file creates an X86 system with a KVM CPU and GPU device capable of +running the MI300 ISA (gfx942). Most of this file sets up a runscript which +will load in a binary, shell script, or python file from the host and run that +within gem5. Jump to line 146 for list of system parameters to configure. 
+""" + +import argparse +import base64 +import os +import sys +import tempfile +from typing import Optional + +import runfs +from amd import AmdGPUOptions +from common import ( + GPUTLBOptions, + Options, +) +from ruby import Ruby + +import m5 + +from gem5.resources.resource import AbstractResource + +demo_runscript_without_checkpoint = """\ +export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH +export HSA_ENABLE_INTERRUPT=0 +export HCC_AMDGPU_TARGET=gfx942 +export HSA_OVERRIDE_GFX_VERSION="9.4.2" +dmesg -n8 +cat /proc/cpuinfo +dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128 +if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then + echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." + /sbin/m5 exit +fi +modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0 +echo "Running {} {}" +echo "{}" | base64 -d > myapp +chmod +x myapp +./myapp {} +/sbin/m5 exit +""" + +demo_runscript_with_checkpoint = """\ +export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH +export HSA_ENABLE_INTERRUPT=0 +export HCC_AMDGPU_TARGET=gfx942 +export HSA_OVERRIDE_GFX_VERSION="9.4.2" +dmesg -n8 +dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128 +if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then + echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." 
+ /sbin/m5 exit +fi +modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0 +echo "Running {} {}" +echo "{}" | base64 -d > myapp +chmod +x myapp +/sbin/m5 checkpoint +./myapp {} +/sbin/m5 exit +""" + + +def addDemoOptions(parser): + parser.add_argument( + "-a", "--app", default=None, help="GPU application to run" + ) + parser.add_argument( + "-o", "--opts", default="", help="GPU application arguments" + ) + + +def runMI300GPUFS( + cpu_type, + disk: Optional[AbstractResource] = None, + kernel: Optional[AbstractResource] = None, + app: Optional[AbstractResource] = None, +): + parser = argparse.ArgumentParser() + runfs.addRunFSOptions(parser) + Options.addCommonOptions(parser) + AmdGPUOptions.addAmdGPUOptions(parser) + Ruby.define_options(parser) + GPUTLBOptions.tlb_options(parser) + addDemoOptions(parser) + + # Parse now so we can override options + args = parser.parse_args() + demo_runscript = "" + + if disk != None: + args.disk_image = disk.get_local_path() + if kernel != None: + args.kernel = kernel.get_local_path() + if app != None: + args.app = app.get_local_path() + + # Create temp script to run application + if not os.path.isfile(args.app): + print("Could not find application", args.app) + sys.exit(1) + + # Choose runscript based on whether any checkpointing args are set + if args.checkpoint_dir is not None: + demo_runscript = demo_runscript_with_checkpoint + else: + demo_runscript = demo_runscript_without_checkpoint + + with open(os.path.abspath(args.app), "rb") as binfile: + encodedBin = base64.b64encode(binfile.read()).decode() + + _, tempRunscript = tempfile.mkstemp() + with open(tempRunscript, "w") as b64file: + runscriptStr = demo_runscript.format( + args.app, args.opts, encodedBin, args.opts + ) + b64file.write(runscriptStr) + + args.script = tempRunscript + + # Defaults for CPU + args.cpu_type = "X86KvmCPU" + args.mem_size = "8GB" + + # Defaults for MI300X + args.gpu_device = "MI300X" + args.dgpu_mem_size = "16GB" # GPU memory 
size, must be 16GB currently. + + # See: https://rocm.docs.amd.com/en/latest/conceptual/gpu-arch/mi300.html + # Topology for one XCD. Number of CUs is approximately 304 / 8, rounded + # up to 40 due to gem5 restriction of 4 CUs per SQC / scalar cache. + args.num_compute_units = 40 + args.gpu_topology = "Crossbar" + + # Run gem5 + runfs.runGpuFSSystem(args) + + +if __name__ == "__m5_main__": + runMI300GPUFS("X86KvmCPU") diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index 9dcc1187f3..2220c33df5 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -134,23 +134,41 @@ def addRunFSOptions(parser): parser.add_argument( "--gpu-device", default="Vega10", - choices=["Vega10", "MI100", "MI200"], - help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or " - "MI200 (gfx90a)", + choices=["Vega10", "MI100", "MI200", "MI300X"], + help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 " + "(gfx90a), or MI300X (gfx942).", ) parser.add_argument( - "--debug-at-gpu-kernel", + "--debug-at-gpu-task", type=int, default=-1, - help="Turn on debug flags starting with this kernel", + help="Turn on debug flags starting with this task (counting both blit" + " and non-blit kernels)", ) parser.add_argument( - "--exit-at-gpu-kernel", + "--exit-at-gpu-task", type=int, default=-1, - help="Exit simulation after running this many kernels", + help="Exit simulation after running this many tasks (counting both " + "blit and non-blit kernels)", + ) + + parser.add_argument( + "--exit-after-gpu-kernel", + type=int, + default=-1, + help="Exit simulation after completing this (non-blit) kernel", + ) + + parser.add_argument( + "--skip-until-gpu-kernel", + type=int, + default=0, + help="Skip (non-blit) kernels until reaching this kernel. 
Note that " + "this can impact correctness (the skipped kernels are completely " + "skipped, not fast forwarded)", ) parser.add_argument( @@ -177,6 +195,28 @@ def addRunFSOptions(parser): help="Disable KVM perf counters (use this with LSF / ETX)", ) + parser.add_argument( + "--tcp-rp", + type=str, + default="TreePLRURP", + help="cache replacement policy for tcp", + ) + + parser.add_argument( + "--tcc-rp", + type=str, + default="TreePLRURP", + help="cache replacement policy for tcc", + ) + + # sqc rp both changes sqc rp and scalar cache rp + parser.add_argument( + "--sqc-rp", + type=str, + default="TreePLRURP", + help="cache replacement policy for sqc", + ) + def runGpuFSSystem(args): """ @@ -230,8 +270,9 @@ def runGpuFSSystem(args): print("Running the simulation") sim_ticks = args.abs_max_tick - kernels_launched = 0 - if args.debug_at_gpu_kernel != -1: + kernels_completed = 0 + tasks_completed = 0 + if args.debug_at_gpu_task != -1: m5.trace.disable() exit_event = m5.simulate(sim_ticks) @@ -249,16 +290,27 @@ def runGpuFSSystem(args): m5.checkpoint(args.checkpoint_dir) break elif "GPU Kernel Completed" in exit_event.getCause(): - kernels_launched += 1 + if kernels_completed == args.exit_after_gpu_kernel: + print(f"Exiting after GPU kernel {kernels_completed}") + break + kernels_completed += 1 + tasks_completed += 1 + elif "GPU Blit Kernel Completed" in exit_event.getCause(): + tasks_completed += 1 + elif "Skipping GPU Kernel" in exit_event.getCause(): + print(f"Skipping GPU kernel {kernels_completed}") + kernels_completed += 1 + tasks_completed += 1 else: print( f"Unknown exit event: {exit_event.getCause()}. Continuing..." 
) - if kernels_launched == args.debug_at_gpu_kernel: + if tasks_completed == args.debug_at_gpu_task: + print(f"Enabling debug flags @ GPU task {tasks_completed}") m5.trace.enable() - if kernels_launched == args.exit_at_gpu_kernel: - print(f"Exiting @ GPU kernel {kernels_launched}") + if tasks_completed == args.exit_at_gpu_task: + print(f"Exiting @ GPU task {tasks_completed}") break exit_event = m5.simulate(sim_ticks - m5.curTick()) diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py index 4bca52c77e..55937cd255 100644 --- a/configs/example/gpufs/system/amdgpu.py +++ b/configs/example/gpufs/system/amdgpu.py @@ -33,7 +33,10 @@ from m5.objects import * def createGPU(system, args): shader = Shader( - n_wf=args.wfs_per_simd, timing=True, clk_domain=system.clk_domain + n_wf=args.wfs_per_simd, + cu_per_sqc=args.cu_per_sqc, + timing=True, + clk_domain=system.clk_domain, ) # VIPER GPU protocol implements release consistency at GPU side. So, @@ -84,6 +87,7 @@ def createGPU(system, args): vrfs = [] vrf_pool_mgrs = [] srfs = [] + rfcs = [] srf_pool_mgrs = [] for j in range(args.simds_per_cu): for k in range(shader.n_wf): @@ -133,10 +137,16 @@ def createGPU(system, args): num_regs=args.sreg_file_size, ) ) + rfcs.append( + RegisterFileCache( + simd_id=j, cache_size=args.register_file_cache_size + ) + ) compute_units[-1].wavefronts = wavefronts compute_units[-1].vector_register_file = vrfs compute_units[-1].scalar_register_file = srfs + compute_units[-1].register_file_cache = rfcs compute_units[-1].register_manager = RegisterManager( policy=args.registerManagerPolicy, vrf_pool_managers=vrf_pool_mgrs, @@ -181,10 +191,14 @@ def connectGPU(system, args): system.pc.south_bridge.gpu.DeviceID = 0x740F system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002 system.pc.south_bridge.gpu.SubsystemID = 0x0C34 + elif args.gpu_device == "MI300X": + system.pc.south_bridge.gpu.DeviceID = 0x740F + system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002 + 
system.pc.south_bridge.gpu.SubsystemID = 0x0C34 elif args.gpu_device == "Vega10": system.pc.south_bridge.gpu.DeviceID = 0x6863 else: - panic(f"Unknown GPU device: {args.gpu_device}") + m5.util.panic(f"Unknown GPU device: {args.gpu_device}") # Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there is # a PCI capabilities list to travse. diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 2803e10fb4..1322650964 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -108,18 +108,26 @@ def makeGpuFSSystem(args): system.cpu.append(shader) # This arbitrary address is something in the X86 I/O hole - hsapp_gpu_map_paddr = 0xE00000000 + hsapp_gpu_map_paddr = 0xE0000000 hsapp_pt_walker = VegaPagetableWalker() gpu_hsapp = HSAPacketProcessor( pioAddr=hsapp_gpu_map_paddr, numHWQueues=args.num_hw_queues, walker=hsapp_pt_walker, ) - dispatcher_exit_events = True if args.exit_at_gpu_kernel > -1 else False + dispatcher_exit_events = False + if args.exit_at_gpu_task > -1: + dispatcher_exit_events = True + if args.exit_after_gpu_kernel > -1: + dispatcher_exit_events = True dispatcher = GPUDispatcher(kernel_exit_events=dispatcher_exit_events) cp_pt_walker = VegaPagetableWalker() + target_kernel = args.skip_until_gpu_kernel gpu_cmd_proc = GPUCommandProcessor( - hsapp=gpu_hsapp, dispatcher=dispatcher, walker=cp_pt_walker + hsapp=gpu_hsapp, + dispatcher=dispatcher, + walker=cp_pt_walker, + target_non_blit_kernel_id=target_kernel, ) shader.dispatcher = dispatcher shader.gpu_cmd_proc = gpu_cmd_proc @@ -153,7 +161,7 @@ def makeGpuFSSystem(args): 0x7D000, ] sdma_sizes = [0x1000] * 8 - elif args.gpu_device == "MI200": + elif args.gpu_device == "MI200" or args.gpu_device == "MI300X": num_sdmas = 5 sdma_bases = [ 0x4980, @@ -180,9 +188,15 @@ def makeGpuFSSystem(args): system.pc.south_bridge.gpu.sdmas = sdma_engines - # Setup PM4 packet processor - pm4_pkt_proc = PM4PacketProcessor() - 
system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc + # Setup PM4 packet processors + pm4_procs = [] + pm4_procs.append( + PM4PacketProcessor( + ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000) + ) + ) + + system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs # GPU data path gpu_mem_mgr = AMDGPUMemoryManager() @@ -199,7 +213,8 @@ def makeGpuFSSystem(args): for sdma in sdma_engines: system._dma_ports.append(sdma) system._dma_ports.append(device_ih) - system._dma_ports.append(pm4_pkt_proc) + for pm4_proc in pm4_procs: + system._dma_ports.append(pm4_proc) system._dma_ports.append(system_hub) system._dma_ports.append(gpu_mem_mgr) system._dma_ports.append(hsapp_pt_walker) @@ -213,7 +228,8 @@ def makeGpuFSSystem(args): for sdma in sdma_engines: sdma.pio = system.iobus.mem_side_ports device_ih.pio = system.iobus.mem_side_ports - pm4_pkt_proc.pio = system.iobus.mem_side_ports + for pm4_proc in pm4_procs: + pm4_proc.pio = system.iobus.mem_side_ports system_hub.pio = system.iobus.mem_side_ports # Full system needs special TLBs for SQC, Scalar, and vector data ports @@ -247,7 +263,7 @@ def makeGpuFSSystem(args): 0x00000340, 0x00000000, 0x00000340, - 0x0000000F, + 0x00000000, 0x00000340, 0x00000000, 0x00000000, @@ -265,7 +281,7 @@ def makeGpuFSSystem(args): # See: https://sandpile.org/x86/cpuid.htm#level_0000_0001h # Enables AVX, OSXSAVE, XSAVE, POPCNT, SSE4.2, SSE4.1, CMPXCHG16B, # and FMA. - avx_cpu_features = [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x1C983209] + avx_cpu_features = [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x1C803209] for i, cpu in enumerate(system.cpu): # Break once we reach the shader "CPU" diff --git a/configs/example/gpufs/vega10.py b/configs/example/gpufs/vega10.py index ae74efd39b..9c3116d415 100644 --- a/configs/example/gpufs/vega10.py +++ b/configs/example/gpufs/vega10.py @@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." 
/sbin/m5 exit fi -modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0 +modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0 echo "Running {} {}" echo "{}" | base64 -d > myapp chmod +x myapp diff --git a/configs/example/hsaTopology.py b/configs/example/hsaTopology.py index 2dcbdeca01..4540293482 100644 --- a/configs/example/hsaTopology.py +++ b/configs/example/hsaTopology.py @@ -243,7 +243,7 @@ def createVegaTopology(options): file_append((node_dir, "properties"), node_prop) - # Fiji HBM reporting + # Vega HBM reporting # TODO: Extract size, clk, and width from sim paramters mem_dir = joinpath(node_dir, "mem_banks/0") remake_dir(mem_dir) @@ -260,196 +260,7 @@ def createVegaTopology(options): file_append((mem_dir, "properties"), mem_prop) -# This fakes out a dGPU setup so the runtime correctly operations. The spoofed -# system has a single dGPU and a single socket CPU. Note that more complex -# topologies (multi-GPU, multi-socket CPUs) need to have a different setup -# here or the runtime won't be able to issue Memcpies from one node to another. -# -# TODO: There is way too much hardcoded here. It doesn't effect anything in -# our current ROCm stack (1.6), but it is highly possible that it will in the -# future. We might need to scrub through this and extract the appropriate -# fields from the simulator in the future. -def createFijiTopology(options): - topology_dir = joinpath( - m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology" - ) - remake_dir(topology_dir) - - amdgpu_dir = joinpath(m5.options.outdir, "fs/sys/module/amdgpu/parameters") - remake_dir(amdgpu_dir) - - # Fiji reported VM size in GB. Used to reserve an allocation from CPU - # to implement SVM (i.e. GPUVM64 pointers and X86 pointers agree) - file_append((amdgpu_dir, "vm_size"), 256) - - # Ripped from real Fiji platform to appease KMT version checks - file_append((topology_dir, "generation_id"), 2) - - # Set up system properties. 
Regiter as ast-rocm server - sys_prop = ( - "platform_oem 35498446626881\n" - + "platform_id 71791775140929\n" - + "platform_rev 2\n" - ) - file_append((topology_dir, "system_properties"), sys_prop) - - # Populate the topology tree - # Our dGPU system is two nodes. Node 0 is a CPU and Node 1 is a dGPU - node_dir = joinpath(topology_dir, "nodes/0") - remake_dir(node_dir) - - # Register as a CPU - file_append((node_dir, "gpu_id"), 0) - file_append((node_dir, "name"), "") - - # CPU links. Only thing that matters is we tell the runtime that GPU is - # connected through PCIe to CPU socket 0. - io_links = 1 - io_dir = joinpath(node_dir, "io_links/0") - remake_dir(io_dir) - io_prop = ( - "type 2\n" - + "version_major 0\n" - + "version_minor 0\n" - + "node_from 0\n" - + "node_to 1\n" - + "weight 20\n" - + "min_latency 0\n" - + "max_latency 0\n" - + "min_bandwidth 0\n" - + "max_bandwidth 0\n" - + "recommended_transfer_size 0\n" - + "flags 13\n" - ) - file_append((io_dir, "properties"), io_prop) - - # Populate CPU node properties - node_prop = ( - f"cpu_cores_count {options.num_cpus}\n" - + "simd_count 0\n" - + "mem_banks_count 1\n" - + "caches_count 0\n" - + f"io_links_count {io_links}\n" - + "cpu_core_id_base 0\n" - + "simd_id_base 0\n" - + "max_waves_per_simd 0\n" - + "lds_size_in_kb 0\n" - + "gds_size_in_kb 0\n" - + "wave_front_size 64\n" - + "array_count 0\n" - + "simd_arrays_per_engine 0\n" - + "cu_per_simd_array 0\n" - + "simd_per_cu 0\n" - + "max_slots_scratch_cu 0\n" - + "vendor_id 0\n" - + "device_id 0\n" - + "location_id 0\n" - + "drm_render_minor 0\n" - + "max_engine_clk_ccompute 3400\n" - ) - - file_append((node_dir, "properties"), node_prop) - - # CPU memory reporting - mem_dir = joinpath(node_dir, "mem_banks/0") - remake_dir(mem_dir) - # Heap type value taken from real system, heap type values: - # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317 - mem_prop = ( - "heap_type 0\n" - + "size_in_bytes 
33704329216\n" - + "flags 0\n" - + "width 72\n" - + "mem_clk_max 2400\n" - ) - - file_append((mem_dir, "properties"), mem_prop) - - # Build the GPU node - node_dir = joinpath(topology_dir, "nodes/1") - remake_dir(node_dir) - - # Register as a Fiji - file_append((node_dir, "gpu_id"), 50156) - file_append((node_dir, "name"), "Fiji\n") - - # Should be the same as the render driver filename (dri/renderD) - drm_num = 128 - - # Real Fiji shows 96, but building that topology is complex and doesn't - # appear to be required for anything. - caches = 0 - - # GPU links. Only thing that matters is we tell the runtime that GPU is - # connected through PCIe to CPU socket 0. - io_links = 1 - io_dir = joinpath(node_dir, "io_links/0") - remake_dir(io_dir) - io_prop = ( - "type 2\n" - + "version_major 0\n" - + "version_minor 0\n" - + "node_from 1\n" - + "node_to 0\n" - + "weight 20\n" - + "min_latency 0\n" - + "max_latency 0\n" - + "min_bandwidth 0\n" - + "max_bandwidth 0\n" - + "recommended_transfer_size 0\n" - + "flags 1\n" - ) - file_append((io_dir, "properties"), io_prop) - - # Populate GPU node properties - node_prop = ( - "cpu_cores_count 0\n" - + f"simd_count {options.num_compute_units * options.simds_per_cu}\n" - + "mem_banks_count 1\n" - + f"caches_count {caches}\n" - + f"io_links_count {io_links}\n" - + "cpu_core_id_base 0\n" - + "simd_id_base 2147487744\n" - + f"max_waves_per_simd {options.wfs_per_simd}\n" - + f"lds_size_in_kb {int(options.lds_size / 1024)}\n" - + "gds_size_in_kb 0\n" - + f"wave_front_size {options.wf_size}\n" - + "array_count 4\n" - + f"simd_arrays_per_engine {options.sa_per_complex}\n" - + f"cu_per_simd_array {options.cu_per_sa}\n" - + f"simd_per_cu {options.simds_per_cu}\n" - + "max_slots_scratch_cu 32\n" - + "vendor_id 4098\n" - + "device_id 29440\n" - + "location_id 512\n" - + f"drm_render_minor {drm_num}\n" - + f"max_engine_clk_fcompute {int(toFrequency(options.gpu_clock) / 1000000.0)}\n" - + "local_mem_size 4294967296\n" - + "fw_version 730\n" - + 
"capability 4736\n" - + f"max_engine_clk_ccompute {int(toFrequency(options.CPUClock) / 1000000.0)}\n" - ) - - file_append((node_dir, "properties"), node_prop) - - # Fiji HBM reporting - # TODO: Extract size, clk, and width from sim paramters - mem_dir = joinpath(node_dir, "mem_banks/0") - remake_dir(mem_dir) - # Heap type value taken from real system, heap type values: - # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317 - mem_prop = ( - "heap_type 1\n" - + "size_in_bytes 4294967296\n" - + "flags 0\n" - + "width 4096\n" - + "mem_clk_max 500\n" - ) - - file_append((mem_dir, "properties"), mem_prop) - - -def createCarrizoTopology(options): +def createRavenTopology(options): topology_dir = joinpath( m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology" ) @@ -476,7 +287,6 @@ def createCarrizoTopology(options): file_append((node_dir, "gpu_id"), 2765) gfx_dict = { - "gfx801": {"name": "Carrizo\n", "id": 39028}, "gfx902": {"name": "Raven\n", "id": 5597}, } diff --git a/configs/example/lupv/run_lupv.py b/configs/example/lupv/run_lupv.py index 4be6b924a5..6dc5d3526f 100644 --- a/configs/example/lupv/run_lupv.py +++ b/configs/example/lupv/run_lupv.py @@ -49,8 +49,8 @@ from gem5.utils.requires import requires # Run a check to ensure the right version of gem5 is being used. 
requires(isa_required=ISA.RISCV) -from gem5.components.cachehierarchies.classic.private_l1_private_l2_cache_hierarchy import ( - PrivateL1PrivateL2CacheHierarchy, +from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import ( + PrivateL1PrivateL2WalkCacheHierarchy, ) parser = argparse.ArgumentParser(description="Runs Linux fs test with RISCV.") @@ -72,7 +72,7 @@ parser.add_argument( args = parser.parse_args() -cache_hierarchy = PrivateL1PrivateL2CacheHierarchy( +cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy( l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB" ) @@ -98,8 +98,12 @@ board = LupvBoard( # Set the Full System workload. board.set_kernel_disk_workload( - kernel=obtain_resource("riscv-lupio-linux-kernel"), - disk_image=obtain_resource("riscv-lupio-busybox-img"), + kernel=obtain_resource( + "riscv-lupio-linux-kernel", resource_version="1.0.0" + ), + disk_image=obtain_resource( + "riscv-lupio-busybox-img", resource_version="1.0.0" + ), ) diff --git a/configs/example/riscv/fs_linux.py b/configs/example/riscv/fs_linux.py index c0643c883d..246a1b24d9 100644 --- a/configs/example/riscv/fs_linux.py +++ b/configs/example/riscv/fs_linux.py @@ -145,7 +145,17 @@ Options.addFSOptions(parser) parser.add_argument( "--virtio-rng", action="store_true", help="Enable VirtIORng device" ) - +parser.add_argument( + "--semihosting", + action="store_true", + help="Enable the RISC-V semihosting interface", +) +parser.add_argument( + "--semihosting-root", + default="/some/invalid/root/directory", + type=str, + help="The root directory for files exposed to semihosting", +) # ---------------------------- Parse Options --------------------------- # args = parser.parse_args() @@ -168,11 +178,17 @@ mdesc = SysConfig( system.mem_mode = mem_mode system.mem_ranges = [AddrRange(start=0x80000000, size=mdesc.mem())] +workload_args = dict() +if args.semihosting: + workload_args["semihosting"] = RiscvSemihosting( + 
files_root_dir=args.semihosting_root, + cmd_line=args.kernel, + ) if args.bare_metal: - system.workload = RiscvBareMetal() + system.workload = RiscvBareMetal(**workload_args) system.workload.bootloader = args.kernel else: - system.workload = RiscvLinux() + system.workload = RiscvLinux(**workload_args) system.workload.object_file = args.kernel system.iobus = IOXBar() diff --git a/configs/nvm/sweep.py b/configs/nvm/sweep.py index d5d23ad76a..ab77768e08 100644 --- a/configs/nvm/sweep.py +++ b/configs/nvm/sweep.py @@ -59,7 +59,7 @@ nvm_generators = {"NVM": lambda x: x.createNvm} # Use a single-channel DDR3-1600 x64 (8x8 topology) by default parser.add_argument( - "--nvm-type", + "--mem-type", default="NVM_2400_1x64", choices=ObjectList.mem_list.get_names(), help="type of memory to use", @@ -212,7 +212,7 @@ def trace(): nbr_banks, bank, addr_map, - args.dram_ranks, + args.nvm_ranks, ) yield system.tgen.createExit(0) diff --git a/configs/nvm/sweep_hybrid.py b/configs/nvm/sweep_hybrid.py index 669f847eb1..82a4a6124e 100644 --- a/configs/nvm/sweep_hybrid.py +++ b/configs/nvm/sweep_hybrid.py @@ -143,7 +143,7 @@ MemConfig.config_mem(args, system) # the following assumes that we are using the native controller # with NVM and DRAM interfaces, check to be sure -if not isinstance(system.mem_ctrls[0], m5.objects.HeteroMemCtrl): +if not isinstance(system.mem_ctrls[0], m5.objects.MemCtrl): fatal("This script assumes the controller is a HeteroMemCtrl subclass") if not isinstance(system.mem_ctrls[0].dram, m5.objects.DRAMInterface): fatal("This script assumes the first memory is a DRAMInterface subclass") diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py index 0818b7f0eb..c10ccac647 100644 --- a/configs/ruby/GPU_VIPER.py +++ b/configs/ruby/GPU_VIPER.py @@ -149,7 +149,8 @@ class TCPCache(RubyCache): self.size = MemorySize(options.tcp_size) self.assoc = options.tcp_assoc self.resourceStalls = options.no_tcc_resource_stalls - self.replacement_policy = TreePLRURP() + if 
hasattr(options, "tcp_rp"): + self.replacement_policy = RP_choose(options.tcp_rp) class TCPCntrl(TCP_Controller, CntrlBase): @@ -241,7 +242,8 @@ class SQCCache(RubyCache): def create(self, options): self.size = MemorySize(options.sqc_size) self.assoc = options.sqc_assoc - self.replacement_policy = TreePLRURP() + if hasattr(options, "sqc_rp"): + self.replacement_policy = RP_choose(options.sqc_rp) class SQCCntrl(SQC_Controller, CntrlBase): @@ -303,7 +305,8 @@ class TCC(RubyCache): self.start_index_bit = math.log(options.cacheline_size, 2) + math.log( options.num_tccs, 2 ) - self.replacement_policy = TreePLRURP() + if hasattr(options, "tcc_rp"): + self.replacement_policy = RP_choose(options.tcc_rp) class TCCCntrl(TCC_Controller, CntrlBase): @@ -497,13 +500,6 @@ def define_options(parser): parser.add_argument( "--noL1", action="store_true", default=False, help="bypassL1" ) - parser.add_argument( - "--scalar-buffer-size", - type=int, - default=128, - help="Size of the mandatory queue in the GPU scalar " - "cache controller", - ) parser.add_argument( "--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency" ) @@ -841,9 +837,7 @@ def construct_scalars(options, system, ruby_system, network): scalar_cntrl.responseToSQC = MessageBuffer(ordered=True) scalar_cntrl.responseToSQC.in_port = network.out_port - scalar_cntrl.mandatoryQueue = MessageBuffer( - buffer_size=options.scalar_buffer_size - ) + scalar_cntrl.mandatoryQueue = MessageBuffer() return (scalar_sequencers, scalar_cntrl_nodes) @@ -1133,3 +1127,28 @@ def create_system( ruby_system.network.number_of_virtual_networks = 11 return (cpu_sequencers, dir_cntrl_nodes, mainCluster) + + +def RP_choose(test_name): + if test_name == "TreePLRURP": + replacement_policy = TreePLRURP() + elif test_name == "LRURP": + replacement_policy = LRURP() + elif test_name == "FIFORP": + replacement_policy = FIFORP() + elif test_name == "LFURP": + replacement_policy = LFURP() + elif test_name == "LIPRP": + replacement_policy = 
LIPRP() + elif test_name == "MRURP": + replacement_policy = MRURP() + elif test_name == "NRURP": + replacement_policy = NRURP() + elif test_name == "RRIPRP": + replacement_policy = RRIPRP() + elif test_name == "SecondChanceRP": + replacement_policy = SecondChanceRP() + elif test_name == "SHiPMemRP": + replacement_policy = SHiPMemRP() + + return replacement_policy diff --git a/ext/softfloat/softfloat_types.h b/ext/softfloat/softfloat_types.h index af1888f9b9..5123cd39c6 100644 --- a/ext/softfloat/softfloat_types.h +++ b/ext/softfloat/softfloat_types.h @@ -47,6 +47,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | the types below may, if desired, be defined as aliases for the native types | (typically 'float' and 'double', and possibly 'long double'). *----------------------------------------------------------------------------*/ +typedef struct { uint8_t v; } float8_t; typedef struct { uint16_t v; } float16_t; typedef struct { uint32_t v; } float32_t; typedef struct { uint64_t v; } float64_t; diff --git a/ext/systemc/SConscript b/ext/systemc/SConscript index 5248fc32d9..d6c3f3f135 100644 --- a/ext/systemc/SConscript +++ b/ext/systemc/SConscript @@ -25,13 +25,16 @@ import os from m5.util.terminal import get_termcap -import gem5_scons +import sys Import('env') systemc = env.Clone() build_root = Dir('.').abspath src_root = Dir('.').srcdir.abspath +gem5_root = Dir('#../..').srcnode().abspath +sys.path.append(os.path.join(gem5_root, 'site_scons')) +import gem5_scons systemc.Prepend(CPPPATH=Dir('./src').srcnode()) systemc.Prepend(CPATH=Dir('./src')) diff --git a/ext/testlib/configuration.py b/ext/testlib/configuration.py index 60c0c17654..cebf493add 100644 --- a/ext/testlib/configuration.py +++ b/ext/testlib/configuration.py @@ -245,7 +245,6 @@ def define_constants(constants): constants.isa_tag_type = "isa" constants.x86_tag = "X86" - constants.gcn3_x86_tag = "GCN3_X86" constants.vega_x86_tag = "VEGA_X86" constants.sparc_tag = "SPARC" constants.riscv_tag 
= "RISCV" @@ -274,7 +273,6 @@ def define_constants(constants): constants.supported_tags = { constants.isa_tag_type: ( constants.x86_tag, - constants.gcn3_x86_tag, constants.vega_x86_tag, constants.sparc_tag, constants.riscv_tag, @@ -305,7 +303,6 @@ def define_constants(constants): constants.target_host = { constants.arm_tag: (constants.host_arm_tag,), constants.x86_tag: (constants.host_x86_64_tag,), - constants.gcn3_x86_tag: (constants.host_x86_64_tag,), constants.vega_x86_tag: (constants.host_x86_64_tag,), constants.sparc_tag: (constants.host_x86_64_tag,), constants.riscv_tag: (constants.host_x86_64_tag,), diff --git a/optional-requirements.txt b/optional-requirements.txt index f88787df1f..d69c960fa1 100644 --- a/optional-requirements.txt +++ b/optional-requirements.txt @@ -1 +1 @@ -tqdm==4.64.1 +tqdm==4.66.4 diff --git a/requirements.txt b/requirements.txt index 4b820f51ba..9a1748f82a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -mypy==1.5.1 -pre-commit==2.20.0 +mypy==1.10.0 +pre-commit==3.7.1 diff --git a/site_scons/gem5_scons/configure.py b/site_scons/gem5_scons/configure.py index c1b9fb56cc..c1d3f8f0e0 100644 --- a/site_scons/gem5_scons/configure.py +++ b/site_scons/gem5_scons/configure.py @@ -59,13 +59,15 @@ def CheckCxxFlag(context, flag, autoadd=True): return ret -def CheckLinkFlag(context, flag, autoadd=True, set_for_shared=True): +def CheckLinkFlag(context, flag, autoadd=True, set_for_shared=True, code=None): context.Message(f"Checking for linker {flag} support... 
") last_linkflags = context.env["LINKFLAGS"] context.env.Append(LINKFLAGS=[flag]) pre_werror = context.env["LINKFLAGS"] context.env.Append(LINKFLAGS=["-Werror"]) - ret = context.TryLink("int main(int, char *[]) { return 0; }", ".cc") + if not code: + code = "int main(int, char *[]) { return 0; }" + ret = context.TryLink(code, ".cc") context.env["LINKFLAGS"] = pre_werror if not (ret and autoadd): context.env["LINKFLAGS"] = last_linkflags diff --git a/src/Doxyfile b/src/Doxyfile index 68d9b3b44b..2206f17669 100644 --- a/src/Doxyfile +++ b/src/Doxyfile @@ -31,7 +31,7 @@ PROJECT_NAME = gem5 # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = v23.1.0.0 +PROJECT_NUMBER = v24.0.0.0 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/src/Kconfig b/src/Kconfig index 2d24aad1ad..268f7ca1c5 100644 --- a/src/Kconfig +++ b/src/Kconfig @@ -51,3 +51,4 @@ rsource "arch/Kconfig" rsource "cpu/Kconfig" rsource "systemc/Kconfig" rsource "gpu-compute/Kconfig" +rsource "test_objects/Kconfig" diff --git a/src/arch/SConscript b/src/arch/SConscript index 2426401d73..0607c7a47c 100644 --- a/src/arch/SConscript +++ b/src/arch/SConscript @@ -68,7 +68,7 @@ if env['CONF']['BUILD_ISA']: error("At least one ISA need to be set") -amdgpu_isa = ['gcn3', 'vega'] +amdgpu_isa = ['vega'] if env['CONF']['BUILD_GPU']: env.SwitchingHeaders( diff --git a/src/arch/amdgpu/Kconfig b/src/arch/amdgpu/Kconfig index 5140f2b103..38c3533eb8 100644 --- a/src/arch/amdgpu/Kconfig +++ b/src/arch/amdgpu/Kconfig @@ -29,5 +29,4 @@ prompt "GPU ISA" endchoice endif -rsource "gcn3/Kconfig" rsource "vega/Kconfig" diff --git a/src/arch/amdgpu/common/SConscript b/src/arch/amdgpu/common/SConscript index ffa5fcb5da..82f9f01d77 100644 --- a/src/arch/amdgpu/common/SConscript +++ b/src/arch/amdgpu/common/SConscript @@ -34,7 +34,7 @@ Import('*') if not 
env['CONF']['BUILD_GPU']: Return() -if env['CONF']['TARGET_GPU_ISA'] in ('gcn3', 'vega'): +if env['CONF']['TARGET_GPU_ISA'] in ('vega',): SimObject('X86GPUTLB.py', sim_objects=['X86GPUTLB', 'TLBCoalescer']) Source('tlb.cc') diff --git a/src/arch/amdgpu/common/dtype/README.md b/src/arch/amdgpu/common/dtype/README.md new file mode 100644 index 0000000000..02f1964fdb --- /dev/null +++ b/src/arch/amdgpu/common/dtype/README.md @@ -0,0 +1,21 @@ +# Microscaling Formats + +This directory defines [microscaling formats](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) which are reduced precision floating point formats. +The class makes some assumptions to simplify things and is not completely generic. +For example: +- Types must be smaller than 32-bits. +- Type conversions currently assume that either: + - The destination format exponent and mantissa bits are both greater or equal to the source format. + - OR the destination format exponent and mantissa are both less than or equal to the source format. + - In other words, one type cannot have larger exponent and smaller mantissa and vice versa. +- Basic MX operations are implementation defined, meaning MX types can be converted to FP32 for arithmetic + - This means that arithmetic operators need not be defined for MX types. +- Exponent and mantissa of zero is zero. There is no special case for the sign (i.e., -0 is not special). +- The spec does not differentiate between signaling and quiet NaN, therefore quiet NaN is used. 
+- New types must template specialize the following standard library methods: + - isinf(T) + - isnan(T) + - isnormal(T) +- New types must template specialize the following std::numeric_limits members / methods: + - has_infinity / infinity() + - has_quiet_NaN / quiet_NaN() diff --git a/src/arch/amdgpu/gcn3/SConsopts b/src/arch/amdgpu/common/dtype/SConscript similarity index 92% rename from src/arch/amdgpu/gcn3/SConsopts rename to src/arch/amdgpu/common/dtype/SConscript index edccf603fa..67a3849651 100644 --- a/src/arch/amdgpu/gcn3/SConsopts +++ b/src/arch/amdgpu/common/dtype/SConscript @@ -1,6 +1,4 @@ -# -*- mode:python -*- - -# Copyright (c) 2015, 2017 Advanced Micro Devices, Inc. +# Copyright (c) 2024 Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,4 +29,4 @@ Import('*') -main.Append(ALL_GPU_ISAS=['gcn3']) +GTest('mxfp.test', 'mxfp.test.cc') diff --git a/src/arch/amdgpu/common/dtype/binary32.hh b/src/arch/amdgpu/common/dtype/binary32.hh new file mode 100644 index 0000000000..441eed57ca --- /dev/null +++ b/src/arch/amdgpu/common/dtype/binary32.hh @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ARCH_AMDGPU_COMMON_DTYPE_BINARY32_HH__ +#define __ARCH_AMDGPU_COMMON_DTYPE_BINARY32_HH__ + +namespace gem5 +{ + +namespace AMDGPU +{ + +// Same as IEEE 754 binary 32 - Microscaling types are converted to/from +// this format by default. For now as there do not seem to be any MI300 +// instructions operating directly on the types (i.e., they all cast to FP32 +// first and then perform arithmetic operations). +typedef union binary32_u +{ + enum bitSizes + { + ebits = 8, + mbits = 23, + sbits = 1, + bias = 127, + + inf = 0x7f800000, + nan = 0x7f800100, + max = 0x7f7fffff + }; + + uint32_t storage; + float fp32; + struct + { + unsigned mant : 23; + unsigned exp : 8; + unsigned sign : 1; + }; + + // To help with stdlib functions with T = float. 
+ operator float() const + { + return fp32; + } +} binary32; +static_assert(sizeof(binary32) == 4); + +} // namespace AMDGPU + +} // namespace gem5 + +namespace std +{ + +template<> +class numeric_limits +{ + public: + static constexpr bool has_quiet_NaN = true; + static gem5::AMDGPU::binary32 quiet_NaN() + { + gem5::AMDGPU::binary32 tmp; + tmp.fp32 = std::numeric_limits::quiet_NaN(); + return tmp; + } + + static constexpr bool has_infinity = true; + static gem5::AMDGPU::binary32 infinity() + { + gem5::AMDGPU::binary32 tmp; + tmp.fp32 = std::numeric_limits::infinity(); + return tmp; + } + + static gem5::AMDGPU::binary32 max() + { + gem5::AMDGPU::binary32 tmp; + tmp.fp32 = std::numeric_limits::max(); + return tmp; + } +}; + +} // namespace std + +#endif // __ARCH_AMDGPU_COMMON_DTYPE_BINARY32_HH__ diff --git a/src/arch/amdgpu/common/dtype/fp16_e5m10.hh b/src/arch/amdgpu/common/dtype/fp16_e5m10.hh new file mode 100644 index 0000000000..363dcada12 --- /dev/null +++ b/src/arch/amdgpu/common/dtype/fp16_e5m10.hh @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP16_E5M10_HH__ +#define __ARCH_AMDGPU_COMMON_DTYPE_FP16_E5M10_HH__ + +#include + +namespace gem5 +{ + +namespace AMDGPU +{ + +typedef union +{ + enum bitSizes + { + ebits = 5, + mbits = 10, + sbits = 1, + zbits = 16, + bias = 15, + + inf = 0x7c000000, + nan = 0x7c100000, + max = 0x7bff0000 + }; + + uint32_t storage; + struct + { + unsigned zero : zbits; + unsigned mant : mbits; + unsigned exp : ebits; + unsigned sign : sbits; + }; +} fp16_e5m10_info; +static_assert(sizeof(fp16_e5m10_info) == 4); + +} // namespace AMDGPU + +} // namespace gem5 + + +// std library cmath definitions +namespace std +{ + +constexpr bool isinf(gem5::AMDGPU::fp16_e5m10_info a) +{ + return a.exp == 0x1F && a.mant == 0; +} + +constexpr bool isnan(gem5::AMDGPU::fp16_e5m10_info a) +{ + return a.exp == 0x1F && a.mant != 0; +} + +constexpr bool isnormal(gem5::AMDGPU::fp16_e5m10_info a) +{ + return !(a.exp == 0 && a.mant != 0); +} + +template<> +class numeric_limits +{ + public: + static constexpr bool has_quiet_NaN = true; + static gem5::AMDGPU::fp16_e5m10_info quiet_NaN() + { + assert(has_quiet_NaN); + gem5::AMDGPU::fp16_e5m10_info tmp; + tmp.storage = 
gem5::AMDGPU::fp16_e5m10_info::nan; + return tmp; + } + + static constexpr bool has_infinity = true; + static gem5::AMDGPU::fp16_e5m10_info infinity() + { + assert(has_infinity); + gem5::AMDGPU::fp16_e5m10_info tmp; + tmp.storage = gem5::AMDGPU::fp16_e5m10_info::inf; + return tmp; + } + + static gem5::AMDGPU::fp16_e5m10_info max() + { + gem5::AMDGPU::fp16_e5m10_info tmp; + tmp.storage = gem5::AMDGPU::fp16_e5m10_info::max; + return tmp; + } +}; + +} // namespace std + +#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP16_E5M10_HH__ diff --git a/src/arch/amdgpu/common/dtype/fp16_e8m7.hh b/src/arch/amdgpu/common/dtype/fp16_e8m7.hh new file mode 100644 index 0000000000..3c796fca51 --- /dev/null +++ b/src/arch/amdgpu/common/dtype/fp16_e8m7.hh @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP16_E8M7_HH__ +#define __ARCH_AMDGPU_COMMON_DTYPE_FP16_E8M7_HH__ + +#include + +namespace gem5 +{ + +namespace AMDGPU +{ + +typedef union +{ + enum bitSizes + { + ebits = 8, + mbits = 7, + sbits = 1, + zbits = 16, + bias = 127, + + inf = 0x7f800000, + nan = 0x7f810000, + max = 0x7f7f0000 + }; + + uint32_t storage; + struct + { + unsigned zero : zbits; + unsigned mant : mbits; + unsigned exp : ebits; + unsigned sign : sbits; + }; +} fp16_e8m7_info; +static_assert(sizeof(fp16_e8m7_info) == 4); + +} // namespace AMDGPU + +} // namespace gem5 + + +// std library cmath definitions +namespace std +{ + +constexpr bool isinf(gem5::AMDGPU::fp16_e8m7_info a) +{ + return a.exp == 0xFF && a.mant == 0; +} + +constexpr bool isnan(gem5::AMDGPU::fp16_e8m7_info a) +{ + return a.exp == 0xFF && a.mant != 0; +} + +constexpr bool isnormal(gem5::AMDGPU::fp16_e8m7_info a) +{ + return !(a.exp == 0 && a.mant != 0); +} + +template<> +class numeric_limits +{ + public: + static constexpr bool has_quiet_NaN = true; + static gem5::AMDGPU::fp16_e8m7_info quiet_NaN() + { + assert(has_quiet_NaN); + gem5::AMDGPU::fp16_e8m7_info tmp; + tmp.storage = gem5::AMDGPU::fp16_e8m7_info::nan; + return tmp; + } + + static constexpr bool has_infinity = true; + static gem5::AMDGPU::fp16_e8m7_info infinity() + { + assert(has_infinity); + gem5::AMDGPU::fp16_e8m7_info tmp; + tmp.storage = gem5::AMDGPU::fp16_e8m7_info::inf; + return tmp; + 
} + + static gem5::AMDGPU::fp16_e8m7_info max() + { + gem5::AMDGPU::fp16_e8m7_info tmp; + tmp.storage = gem5::AMDGPU::fp16_e8m7_info::max; + return tmp; + } +}; + +} // namespace std + +#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP16_E8M7_HH__ diff --git a/src/arch/amdgpu/common/dtype/fp8_e4m3.hh b/src/arch/amdgpu/common/dtype/fp8_e4m3.hh new file mode 100644 index 0000000000..46d2685c00 --- /dev/null +++ b/src/arch/amdgpu/common/dtype/fp8_e4m3.hh @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP8_E4M3_HH__ +#define __ARCH_AMDGPU_COMMON_DTYPE_FP8_E4M3_HH__ + +#include + +namespace gem5 +{ + +namespace AMDGPU +{ + +typedef union +{ + enum bitSizes + { + ebits = 4, + mbits = 3, + sbits = 1, + zbits = 24, + bias = 7, + + inf = (0x7f << zbits), + nan = (0xff << zbits), + max = (0x7f << zbits) + }; + + uint32_t storage; + struct + { + unsigned zero : zbits; + unsigned mant : mbits; + unsigned exp : ebits; + unsigned sign : sbits; + }; +} fp8_e4m3_info; +static_assert(sizeof(fp8_e4m3_info) == 4); + +} // namespace AMDGPU + +} // namespace gem5 + + +// std library cmath definitions +namespace std +{ + +// Inf not defined +constexpr bool isinf(gem5::AMDGPU::fp8_e4m3_info a) { return false; } + +constexpr bool isnan(gem5::AMDGPU::fp8_e4m3_info a) +{ + return a.exp == 0xF && a.mant == 0x7; +} + +constexpr bool isnormal(gem5::AMDGPU::fp8_e4m3_info a) +{ + return !(a.exp == 0 && a.mant != 0); +} + + +template<> +class numeric_limits +{ + public: + static constexpr bool has_quiet_NaN = true; + static gem5::AMDGPU::fp8_e4m3_info quiet_NaN() + { + assert(has_quiet_NaN); + gem5::AMDGPU::fp8_e4m3_info tmp; + tmp.storage = gem5::AMDGPU::fp8_e4m3_info::nan; + return tmp; + } + + static constexpr bool has_infinity = false; + static gem5::AMDGPU::fp8_e4m3_info infinity() + { + assert(has_infinity); + gem5::AMDGPU::fp8_e4m3_info tmp; + tmp.storage = gem5::AMDGPU::fp8_e4m3_info::inf; + return tmp; + } 
+ + static gem5::AMDGPU::fp8_e4m3_info max() + { + gem5::AMDGPU::fp8_e4m3_info tmp; + tmp.storage = gem5::AMDGPU::fp8_e4m3_info::max; + return tmp; + } +}; + +} // namespace std + +#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP8_E4M3_HH__ diff --git a/src/arch/amdgpu/common/dtype/fp8_e5m2.hh b/src/arch/amdgpu/common/dtype/fp8_e5m2.hh new file mode 100644 index 0000000000..9e1f5812d5 --- /dev/null +++ b/src/arch/amdgpu/common/dtype/fp8_e5m2.hh @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ARCH_AMDGPU_COMMON_DTYPE_FP8_E5M2_HH__ +#define __ARCH_AMDGPU_COMMON_DTYPE_FP8_E5M2_HH__ + +#include + +namespace gem5 +{ + +namespace AMDGPU +{ + +typedef union +{ + enum bitSizes + { + ebits = 5, + mbits = 2, + sbits = 1, + zbits = 24, + bias = 15, + + inf = (0x7c << zbits), + nan = (0xff << zbits), + max = (0x7f << zbits) + }; + + uint32_t storage; + struct + { + unsigned zero : zbits; + unsigned mant : mbits; + unsigned exp : ebits; + unsigned sign : sbits; + }; +} fp8_e5m2_info; +static_assert(sizeof(fp8_e5m2_info) == 4); + +} // namespace AMDGPU + +} // namespace gem5 + + +// std library cmath definitions +namespace std +{ + +constexpr bool isinf(gem5::AMDGPU::fp8_e5m2_info a) +{ + return a.exp == 0x1F && a.mant == 0x0; +} + +constexpr bool isnan(gem5::AMDGPU::fp8_e5m2_info a) +{ + return a.exp == 0x1F && a.mant != 0x0; +} + +constexpr bool isnormal(gem5::AMDGPU::fp8_e5m2_info a) +{ + return !(a.exp == 0 && a.mant != 0); +} + +template<> +class numeric_limits +{ + public: + static constexpr bool has_quiet_NaN = true; + static gem5::AMDGPU::fp8_e5m2_info quiet_NaN() + { + assert(has_quiet_NaN); + gem5::AMDGPU::fp8_e5m2_info tmp; + tmp.storage = gem5::AMDGPU::fp8_e5m2_info::nan; + return tmp; + } + + static constexpr bool has_infinity = true; + static gem5::AMDGPU::fp8_e5m2_info infinity() + { + assert(has_infinity); + gem5::AMDGPU::fp8_e5m2_info tmp; + tmp.storage = gem5::AMDGPU::fp8_e5m2_info::inf; + return 
tmp; + } + + static gem5::AMDGPU::fp8_e5m2_info max() + { + gem5::AMDGPU::fp8_e5m2_info tmp; + tmp.storage = gem5::AMDGPU::fp8_e5m2_info::max; + return tmp; + } +}; + +} // namespace std + +#endif // __ARCH_AMDGPU_COMMON_DTYPE_FP8_E5M2_HH__ diff --git a/src/arch/amdgpu/common/dtype/mxfp.hh b/src/arch/amdgpu/common/dtype/mxfp.hh new file mode 100644 index 0000000000..d7edb32dbf --- /dev/null +++ b/src/arch/amdgpu/common/dtype/mxfp.hh @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_HH__ +#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_HH__ + +#include +#include +#include + +#include "arch/amdgpu/common/dtype/mxfp_convert.hh" + +namespace gem5 +{ + +namespace AMDGPU +{ + +// Base class for all microscaling types. The sizes of everything are +// determined by the enum fields in the FMT struct. All of these share the +// same operator overloads which convert to float before arithmetic and +// convert back if assigned to a microscaling type. +template +class mxfp +{ + public: + mxfp() = default; + mxfp(float f) : mode(roundTiesToEven) + { + data = float_to_mxfp(f); + } + + // Set raw bits, used by gem5 to set a raw value read from VGPRs. + mxfp(const uint32_t& raw) + { + // The info unions end up being "left" aligned. For example, in FP4 + // only the bits 31:28 are used. Shift the input by the storage size + // of 32 by the type size (sign + exponent + mantissa bits). 
+ data = raw; + data <<= (32 - int(FMT::sbits) - int(FMT::ebits) - int(FMT::mbits)); + } + + mxfp(const mxfp& f) + { + FMT conv_out; + conv_out = convertMXFP(f.getFmt()); + data = conv_out.storage; + } + + mxfp& + operator=(const float& f) + { + data = float_to_mxfp(f); + return *this; + } + + mxfp& + operator=(const mxfp& f) + { + FMT conv_out; + conv_out = convertMXFP(f.getFmt()); + data = conv_out.storage; + return *this; + } + + operator float() const + { + binary32 out; + FMT in; + in.storage = data; + out = convertMXFP(in, mode); + + return out.fp32; + } + + constexpr static int + size() + { + return int(FMT::mbits) + int(FMT::ebits) + int(FMT::sbits); + } + + // Intentionally use storage > size() so that a storage type is not needed + // as a template parameter. + uint32_t data = 0; + + FMT + getFmt() const + { + FMT out; + out.storage = data; + return out; + } + + void + setFmt(FMT in) + { + data = in.storage; + } + + void + scale(const float& f) + { + binary32 bfp; + bfp.fp32 = f; + int scale_val = bfp.exp - bfp.bias; + + // Scale value of 0xFF is NaN. Scaling by NaN returns NaN. + // In this implementation, types without NaN define it as zero. 
+ if (scale_val == 0xFF) { + data = FMT::nan; + return; + } + + FMT in = getFmt(); + int exp = in.exp; + + if (exp + scale_val > max_exp()) { + in.exp = max_exp(); + } else if (exp + scale_val < min_exp()) { + in.exp = min_exp(); + } else { + in.exp = exp + scale_val; + } + + data = in.storage; + } + + private: + mxfpRoundingMode mode = roundTiesToEven; + + uint32_t + float_to_mxfp(float f) + { + if (std::isinf(f)) { + assert(std::numeric_limits::has_infinity); + return FMT::inf; + } + + if (std::isnan(f)) { + assert(std::numeric_limits::has_quiet_NaN); + return FMT::nan; + } + + return float_to_mxfp_nocheck(f); + } + + uint32_t + float_to_mxfp_nocheck(float f) + { + binary32 in; + in.fp32 = f; + + FMT out; + out.storage = 0; + + out = convertMXFP(in, mode); + + return out.storage; + } +}; + +// Unary operators +template +inline T operator+(T a) +{ + return a; +} + +template +inline T operator-(T a) +{ + // Flip sign bit + a.data ^= 0x80000000; + return a; +} + +template +inline T operator++(T a) +{ + a = a + T(1.0f); + return a; +} + +template +inline T operator--(T a) +{ + a = a - T(1.0f); + return a; +} + +template +inline T operator++(T a, int) +{ + T original = a; + ++a; + return original; +} + +template +inline T operator--(T a, int) +{ + T original = a; + --a; + return original; +} + +// Math operators +template +inline T operator+(T a, T b) +{ + return T(float(a) + float(b)); +} + +template +inline T operator-(T a, T b) +{ + return T(float(a) - float(b)); +} + +template +inline T operator*(T a, T b) +{ + return T(float(a) * float(b)); +} + +template +inline T operator/(T a, T b) +{ + return T(float(a) / float(b)); +} + +template +inline T operator+=(T &a, T b) +{ + a = a + b; + return a; +} + +template +inline T operator-=(T &a, T b) +{ + a = a - b; + return a; +} + +template +inline T operator*=(T &a, T b) +{ + a = a * b; + return a; +} + +template +inline T operator/=(T &a, T b) +{ + a = a / b; + return a; +} + +// Comparison operators +template +inline 
bool operator<(T a, T b) +{ + return float(a) < float(b); +} + +template +inline bool operator>(T a, T b) +{ + return float(a) > float(b); +} + +template +inline bool operator<=(T a, T b) +{ + return float(a) <= float(b); +} + +template +inline bool operator>=(T a, T b) +{ + return float(a) >= float(b); +} + +template +inline bool operator==(T a, T b) +{ + return float(a) == float(b); +} + +template +inline bool operator!=(T a, T b) +{ + return float(a) != float(b); +} + +} // namespace AMDGPU + +} // namespace gem5 + +#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_HH__ diff --git a/src/arch/amdgpu/common/dtype/mxfp.test.cc b/src/arch/amdgpu/common/dtype/mxfp.test.cc new file mode 100644 index 0000000000..ca7b2fac60 --- /dev/null +++ b/src/arch/amdgpu/common/dtype/mxfp.test.cc @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "arch/amdgpu/common/dtype/mxfp_types.hh" + +template +bool test_raw_mxfp(T raw_mxfp, int bits) +{ + float tmp = float(raw_mxfp); + T from_float(tmp); + + // Simply check that casting to float and back yields the same bit values. + // Exclude inf/NaN as those have multiple values in some MXFP types. + if (raw_mxfp.data != from_float.data && + !std::isnan(tmp) && !std::isinf(tmp)) { + return false; + } + + return true; +} + +template +int test_type(int bits) +{ + T raw_mxfp; + int errors = 0; + + int max_val = 1 << bits; + for (int val = 0; val < max_val; ++val) { + // Raw data is aligned to MSb in MXFP types. Shift into place. 
+ raw_mxfp.data = val << (32 - bits); + if (!test_raw_mxfp(raw_mxfp, bits)) { + errors++; + } + } + + return errors; +} + +TEST(MxfpTest, MxBf16Test) +{ + using T = gem5::AMDGPU::mxbfloat16; + + int errors = test_type(T::size()); + + EXPECT_EQ(errors, 0); +} + +TEST(MxfpTest, MxFp16Test) +{ + using T = gem5::AMDGPU::mxfloat16; + + int errors = test_type(T::size()); + + EXPECT_EQ(errors, 0); +} + +TEST(MxfpTest, MxBf8Test) +{ + using T = gem5::AMDGPU::mxbfloat8; + + int errors = test_type(T::size()); + + EXPECT_EQ(errors, 0); +} + +TEST(MxfpTest, MxFp8Test) +{ + using T = gem5::AMDGPU::mxfloat8; + + int errors = test_type(T::size()); + + EXPECT_EQ(errors, 0); +} diff --git a/src/arch/amdgpu/common/dtype/mxfp_convert.hh b/src/arch/amdgpu/common/dtype/mxfp_convert.hh new file mode 100644 index 0000000000..641d5f5732 --- /dev/null +++ b/src/arch/amdgpu/common/dtype/mxfp_convert.hh @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_CONVERT_HH__ +#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_CONVERT_HH__ + +#include + +#include "arch/amdgpu/common/dtype/mxfp_type_info.hh" +#include "base/bitfield.hh" + +namespace gem5 +{ + +namespace AMDGPU +{ + +// The various rounding modes for microscaling formats. roundTiesToEven must +// be supported. Other rounding modes may be supported. +enum mxfpRoundingMode +{ + roundTiesToEven, + roundStochastic +}; + +// Conversion functions - For instructions that convert from one microscaling +// format to another. We only need the conversion functions as there do not +// appear to be any instructions yet which operate directly on the MX formats. +// +// in - An MXFP info struct type +// mode - rounding mode +// seed - input value for stochastic rounding function +template +dFMT convertMXFP(sFMT in, mxfpRoundingMode mode = roundTiesToEven, + uint32_t seed = 0) +{ + // We assume that *both* exponent and mantissa bits are both >= or <= + // the target type. Checkable at compile time. + // + // This is not necessarily a limitation, others just are not implemented. + // Figuring this out would be interesting for converting FP8 <-> BF8 for + // example. So far all GPU conversion instructions convert explicitly to + // a larger type from a smaller type or smaller to larger. 
+ static_assert(((int(sFMT::mbits) >= int(dFMT::mbits)) && + (int(sFMT::ebits) >= int(dFMT::ebits))) + || ((int(sFMT::mbits) <= int(dFMT::mbits)) && + (int(sFMT::ebits) <= int(dFMT::ebits)))); + + dFMT out; + out.storage = 0; + + if (int(sFMT::mbits) >= int(dFMT::mbits) && + int(sFMT::ebits) >= int(dFMT::ebits)) { + // Input format is larger, truncate and round mantissa. MX formats + // are subnormal if exp == 0. Zero out exp in that case. + + if (std::isnan(in)) { + // For types with no NaN return max value. + if (std::numeric_limits::has_quiet_NaN) { + out = std::numeric_limits::quiet_NaN(); + } else { + out = std::numeric_limits::max(); + } + } else if (std::isinf(in)) { + // For types with no Inf return max value. + if (std::numeric_limits::has_infinity) { + out = std::numeric_limits::infinity(); + } else { + out = std::numeric_limits::max(); + } + } else if (in.mant == 0 && in.exp == 0) { + // All MX formats FP32, and FP64 encode 0 as all zeros. Keep sign. + out.mant = 0; + out.exp = 0; + out.sign = in.sign; + } else { + // Extra bits are needed for the mantissa conversion. + uint32_t mant = in.mant & mask(sFMT::mbits); + int32_t exp = in.exp - sFMT::bias + dFMT::bias; + out.sign = in.sign; + + // Input is not subnormal, add the implicit 1 bit. + if (in.exp) { + mant |= (1 << sFMT::mbits); + } + + mant >>= (sFMT::mbits - dFMT::mbits); + + // Output became subnormal + if (exp < 1) { + int shift = 1 - exp; + mant >>= shift; + out.exp = 0; + } else { + out.exp = exp; + } + + mant &= mask(dFMT::mbits); + out.mant = mant; + + // roundTiesToEven is the only required rounding mode for MXFP + // types. Here we take the original mantissa and check the final + // bit which is shifted out when converting the mantissa. If that + // value is one, then we should round up to the next representable + // number. 
If the value is one and all other discarded mantissa + // bits are zero, round towards the number which has an even (0) + // bit value in the least significant mantissa bit. + // + // For denormals, the process is similar however we check the nth + // bit of the converted mantissa, where n is the absolute value of + // the converted exponent. If the value of |exp| is larger than + // the max exponent, round to zero. If it is exactly equal, always + // round up. + // + // If the number of destination and source format mantissa bits are + // the same, the mantissa is unchanged. + if (int(sFMT::mbits) > int(dFMT::mbits) + && mode == roundTiesToEven) { + bool round_up = false; + + int check_shift = sFMT::mbits - dFMT::mbits - 1; + uint32_t check_mant = in.mant & mask(sFMT::mbits); + + check_mant >>= check_shift; + + // out.exp == 0 means subnormal + if (out.exp == 0) { + check_mant = in.mant >> (sFMT::mbits - dFMT::mbits); + + uint32_t max_exp = mask(dFMT::ebits); + if (-exp > max_exp) { + // if exp < -(1 << dFMT::ebits), result should be 0 + round_up = false; + } else if (-exp == max_exp) { + // if exp == -(1 << dFMT::ebits), round up + round_up = true; + } else { + // Use the |exp|'th bit to determine rounding + int check_bit = 1 << -exp; + round_up = (check_mant & check_bit); + } + } else { + round_up = (check_mant & 0x1); + } + + // For roundTiesToEven, if we are exactly between two + // representable numbers, pick the one with an even least + // significant mantissa bit. We are exactly between when + // all of the discarded mantissa bits are 0 (i.e., !sticky). 
+ int sticky = in.mant & mask(sFMT::mbits - dFMT::mbits); + if (round_up && !sticky) { + if (!(out.mant & 1)) { + round_up = false; + } + } + + if (round_up) { + if (out.mant == mask(dFMT::mbits)) { + // mantissa at max value, increment exponent if not inf + if (out.exp != mask(dFMT::ebits)) { + out.exp++; + } + out.mant = 0; + } else { + out.mant++; + } + } + } else if (int(sFMT::mbits) > int(dFMT::mbits) + && mode == roundStochastic) { + // Use the discarded mantissa divided by the max mantissa of + // the source format to determine the probability of rounding + // up. An alternate implementation of this would be to get a + // random number and add that to the input mantissa. Then + // follow the normal rounding path above. + uint32_t discarded = in.mant & mask(sFMT::mbits - dFMT::mbits); + uint32_t max_mant = mask(sFMT::mbits); + + float round_prob = float(discarded) / float(max_mant); + + // Use a stochastic rounding function with the seed value to + // determine compare probability. This is implemented as a + // "Galois LFSR." + auto srFunc = [](uint32_t in) { + uint32_t bit = (in ^ (in >> 1) ^ (in >> 3) ^ (in >> 12)); + return (in >> 1) | (bit << 15); + }; + + // Assume stochastic rounding returns up to max uint32_t. + // This will return an FP value between 0.0f and 1.0f. + float draw_prob = float(srFunc(seed)) + / float(std::numeric_limits::max()); + + // Round up if the number we drew is less than the rounding + // probability. E.g., if round_prob is 90% (0.9) we choose + // values 0.0f - 0.90f to round up. + if (round_prob >= draw_prob) { + if (out.mant == mask(dFMT::mbits)) { + // mantissa at max value, increment exponent if not inf + if (out.exp != mask(dFMT::ebits)) { + out.exp++; + } + out.mant = 0; + } else { + out.mant++; + } + } + } + } + } else if (int(sFMT::mbits) <= int(dFMT::mbits) && + int(sFMT::ebits) <= int(dFMT::ebits)) { + // Input format is smaller. Extend mantissa / exponent and pad with 0. 
+ // Should be the same for all non-stochastic rounding modes. + + if (std::isnan(in)) { + // For types with no NaN return max value. + if (std::numeric_limits::has_quiet_NaN) { + out = std::numeric_limits::quiet_NaN(); + } else { + out = std::numeric_limits::max(); + } + } else if (std::isinf(in)) { + // For types with no Inf return max value. + if (std::numeric_limits::has_infinity) { + out = std::numeric_limits::infinity(); + } else { + out = std::numeric_limits::max(); + } + } else if (in.mant == 0 && in.exp == 0) { + // All MX formats FP32, and FP64 encode 0 as all zeros. Keep sign. + out.mant = 0; + out.exp = 0; + out.sign = in.sign; + } else { + out.mant = in.mant << (dFMT::mbits - sFMT::mbits); + out.exp = in.exp + dFMT::bias - sFMT::bias; + out.sign = in.sign; + + // Normalize input denormals + if (!in.exp && int(sFMT::ebits) != int(dFMT::ebits)) { + uint32_t m = out.mant; + if (m != 0) { + out.exp++; + while (!(m >> dFMT::mbits)) { + m <<= 1; + out.exp--; + } + out.mant = m & mask(dFMT::mbits); + } + } else if (!in.exp) { + // Exponent is the same, but output is not denorm, so add + // implicit 1. This is specific mainly to bf16 -> f32. + uint32_t m = out.mant; + m <<= 1; + out.mant = m & mask(dFMT::mbits); + } + } + } else { + assert(false); + } + + return out; +} + +template +int min_exp() +{ + return 1; +} + +template +int max_exp() +{ + return (1 << FMT::ebits) - 1; +} + + +} // namespace AMDGPU + +} // namespace gem5 + +#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_CONVERT_HH__ diff --git a/src/arch/amdgpu/common/dtype/mxfp_type_info.hh b/src/arch/amdgpu/common/dtype/mxfp_type_info.hh new file mode 100644 index 0000000000..fe433523d6 --- /dev/null +++ b/src/arch/amdgpu/common/dtype/mxfp_type_info.hh @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPE_INFO_HH__ +#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPE_INFO_HH__ + +#include "arch/amdgpu/common/dtype/binary32.hh" +#include "arch/amdgpu/common/dtype/fp16_e5m10.hh" +#include "arch/amdgpu/common/dtype/fp16_e8m7.hh" +#include "arch/amdgpu/common/dtype/fp8_e4m3.hh" +#include "arch/amdgpu/common/dtype/fp8_e5m2.hh" + +#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPE_INFO_HH__ diff --git a/src/arch/amdgpu/common/dtype/mxfp_types.hh b/src/arch/amdgpu/common/dtype/mxfp_types.hh new file mode 100644 index 0000000000..29155901d4 --- /dev/null +++ b/src/arch/amdgpu/common/dtype/mxfp_types.hh @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPES_HH__ +#define __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPES_HH__ + +#include "arch/amdgpu/common/dtype/mxfp.hh" + +namespace gem5 +{ +namespace AMDGPU +{ + +using mxbfloat8 = mxfp; +using mxfloat8 = mxfp; + +using mxbfloat16 = mxfp; +using mxfloat16 = mxfp; + +using mxfloat32 = mxfp; + +} +} + +#endif // __ARCH_AMDGPU_COMMON_DTYPE_MXFP_TYPES_HH__ diff --git a/src/arch/amdgpu/gcn3/decoder.cc b/src/arch/amdgpu/gcn3/decoder.cc deleted file mode 100644 index c0fcc3a7dd..0000000000 --- a/src/arch/amdgpu/gcn3/decoder.cc +++ /dev/null @@ -1,10814 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include - -#include "arch/amdgpu/gcn3/gpu_decoder.hh" -#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh" -#include "arch/amdgpu/gcn3/insts/instructions.hh" - -namespace gem5 -{ - -namespace Gcn3ISA -{ - Decoder::Decoder() - { - } // Decoder - - Decoder::~Decoder() - { - } // ~Decoder - - IsaDecodeMethod Decoder::tableDecodePrimary[] = { - &Decoder::decode_OP_VOP2__V_CNDMASK_B32, - &Decoder::decode_OP_VOP2__V_CNDMASK_B32, - &Decoder::decode_OP_VOP2__V_CNDMASK_B32, - &Decoder::decode_OP_VOP2__V_CNDMASK_B32, - &Decoder::decode_OP_VOP2__V_ADD_F32, - &Decoder::decode_OP_VOP2__V_ADD_F32, - &Decoder::decode_OP_VOP2__V_ADD_F32, - &Decoder::decode_OP_VOP2__V_ADD_F32, - &Decoder::decode_OP_VOP2__V_SUB_F32, - &Decoder::decode_OP_VOP2__V_SUB_F32, - &Decoder::decode_OP_VOP2__V_SUB_F32, - &Decoder::decode_OP_VOP2__V_SUB_F32, - &Decoder::decode_OP_VOP2__V_SUBREV_F32, - &Decoder::decode_OP_VOP2__V_SUBREV_F32, - &Decoder::decode_OP_VOP2__V_SUBREV_F32, - &Decoder::decode_OP_VOP2__V_SUBREV_F32, - &Decoder::decode_OP_VOP2__V_MUL_LEGACY_F32, - &Decoder::decode_OP_VOP2__V_MUL_LEGACY_F32, - &Decoder::decode_OP_VOP2__V_MUL_LEGACY_F32, - &Decoder::decode_OP_VOP2__V_MUL_LEGACY_F32, - 
&Decoder::decode_OP_VOP2__V_MUL_F32, - &Decoder::decode_OP_VOP2__V_MUL_F32, - &Decoder::decode_OP_VOP2__V_MUL_F32, - &Decoder::decode_OP_VOP2__V_MUL_F32, - &Decoder::decode_OP_VOP2__V_MUL_I32_I24, - &Decoder::decode_OP_VOP2__V_MUL_I32_I24, - &Decoder::decode_OP_VOP2__V_MUL_I32_I24, - &Decoder::decode_OP_VOP2__V_MUL_I32_I24, - &Decoder::decode_OP_VOP2__V_MUL_HI_I32_I24, - &Decoder::decode_OP_VOP2__V_MUL_HI_I32_I24, - &Decoder::decode_OP_VOP2__V_MUL_HI_I32_I24, - &Decoder::decode_OP_VOP2__V_MUL_HI_I32_I24, - &Decoder::decode_OP_VOP2__V_MUL_U32_U24, - &Decoder::decode_OP_VOP2__V_MUL_U32_U24, - &Decoder::decode_OP_VOP2__V_MUL_U32_U24, - &Decoder::decode_OP_VOP2__V_MUL_U32_U24, - &Decoder::decode_OP_VOP2__V_MUL_HI_U32_U24, - &Decoder::decode_OP_VOP2__V_MUL_HI_U32_U24, - &Decoder::decode_OP_VOP2__V_MUL_HI_U32_U24, - &Decoder::decode_OP_VOP2__V_MUL_HI_U32_U24, - &Decoder::decode_OP_VOP2__V_MIN_F32, - &Decoder::decode_OP_VOP2__V_MIN_F32, - &Decoder::decode_OP_VOP2__V_MIN_F32, - &Decoder::decode_OP_VOP2__V_MIN_F32, - &Decoder::decode_OP_VOP2__V_MAX_F32, - &Decoder::decode_OP_VOP2__V_MAX_F32, - &Decoder::decode_OP_VOP2__V_MAX_F32, - &Decoder::decode_OP_VOP2__V_MAX_F32, - &Decoder::decode_OP_VOP2__V_MIN_I32, - &Decoder::decode_OP_VOP2__V_MIN_I32, - &Decoder::decode_OP_VOP2__V_MIN_I32, - &Decoder::decode_OP_VOP2__V_MIN_I32, - &Decoder::decode_OP_VOP2__V_MAX_I32, - &Decoder::decode_OP_VOP2__V_MAX_I32, - &Decoder::decode_OP_VOP2__V_MAX_I32, - &Decoder::decode_OP_VOP2__V_MAX_I32, - &Decoder::decode_OP_VOP2__V_MIN_U32, - &Decoder::decode_OP_VOP2__V_MIN_U32, - &Decoder::decode_OP_VOP2__V_MIN_U32, - &Decoder::decode_OP_VOP2__V_MIN_U32, - &Decoder::decode_OP_VOP2__V_MAX_U32, - &Decoder::decode_OP_VOP2__V_MAX_U32, - &Decoder::decode_OP_VOP2__V_MAX_U32, - &Decoder::decode_OP_VOP2__V_MAX_U32, - &Decoder::decode_OP_VOP2__V_LSHRREV_B32, - &Decoder::decode_OP_VOP2__V_LSHRREV_B32, - &Decoder::decode_OP_VOP2__V_LSHRREV_B32, - &Decoder::decode_OP_VOP2__V_LSHRREV_B32, - 
&Decoder::decode_OP_VOP2__V_ASHRREV_I32, - &Decoder::decode_OP_VOP2__V_ASHRREV_I32, - &Decoder::decode_OP_VOP2__V_ASHRREV_I32, - &Decoder::decode_OP_VOP2__V_ASHRREV_I32, - &Decoder::decode_OP_VOP2__V_LSHLREV_B32, - &Decoder::decode_OP_VOP2__V_LSHLREV_B32, - &Decoder::decode_OP_VOP2__V_LSHLREV_B32, - &Decoder::decode_OP_VOP2__V_LSHLREV_B32, - &Decoder::decode_OP_VOP2__V_AND_B32, - &Decoder::decode_OP_VOP2__V_AND_B32, - &Decoder::decode_OP_VOP2__V_AND_B32, - &Decoder::decode_OP_VOP2__V_AND_B32, - &Decoder::decode_OP_VOP2__V_OR_B32, - &Decoder::decode_OP_VOP2__V_OR_B32, - &Decoder::decode_OP_VOP2__V_OR_B32, - &Decoder::decode_OP_VOP2__V_OR_B32, - &Decoder::decode_OP_VOP2__V_XOR_B32, - &Decoder::decode_OP_VOP2__V_XOR_B32, - &Decoder::decode_OP_VOP2__V_XOR_B32, - &Decoder::decode_OP_VOP2__V_XOR_B32, - &Decoder::decode_OP_VOP2__V_MAC_F32, - &Decoder::decode_OP_VOP2__V_MAC_F32, - &Decoder::decode_OP_VOP2__V_MAC_F32, - &Decoder::decode_OP_VOP2__V_MAC_F32, - &Decoder::decode_OP_VOP2__V_MADMK_F32, - &Decoder::decode_OP_VOP2__V_MADMK_F32, - &Decoder::decode_OP_VOP2__V_MADMK_F32, - &Decoder::decode_OP_VOP2__V_MADMK_F32, - &Decoder::decode_OP_VOP2__V_MADAK_F32, - &Decoder::decode_OP_VOP2__V_MADAK_F32, - &Decoder::decode_OP_VOP2__V_MADAK_F32, - &Decoder::decode_OP_VOP2__V_MADAK_F32, - &Decoder::decode_OP_VOP2__V_ADD_U32, - &Decoder::decode_OP_VOP2__V_ADD_U32, - &Decoder::decode_OP_VOP2__V_ADD_U32, - &Decoder::decode_OP_VOP2__V_ADD_U32, - &Decoder::decode_OP_VOP2__V_SUB_U32, - &Decoder::decode_OP_VOP2__V_SUB_U32, - &Decoder::decode_OP_VOP2__V_SUB_U32, - &Decoder::decode_OP_VOP2__V_SUB_U32, - &Decoder::decode_OP_VOP2__V_SUBREV_U32, - &Decoder::decode_OP_VOP2__V_SUBREV_U32, - &Decoder::decode_OP_VOP2__V_SUBREV_U32, - &Decoder::decode_OP_VOP2__V_SUBREV_U32, - &Decoder::decode_OP_VOP2__V_ADDC_U32, - &Decoder::decode_OP_VOP2__V_ADDC_U32, - &Decoder::decode_OP_VOP2__V_ADDC_U32, - &Decoder::decode_OP_VOP2__V_ADDC_U32, - &Decoder::decode_OP_VOP2__V_SUBB_U32, - 
&Decoder::decode_OP_VOP2__V_SUBB_U32, - &Decoder::decode_OP_VOP2__V_SUBB_U32, - &Decoder::decode_OP_VOP2__V_SUBB_U32, - &Decoder::decode_OP_VOP2__V_SUBBREV_U32, - &Decoder::decode_OP_VOP2__V_SUBBREV_U32, - &Decoder::decode_OP_VOP2__V_SUBBREV_U32, - &Decoder::decode_OP_VOP2__V_SUBBREV_U32, - &Decoder::decode_OP_VOP2__V_ADD_F16, - &Decoder::decode_OP_VOP2__V_ADD_F16, - &Decoder::decode_OP_VOP2__V_ADD_F16, - &Decoder::decode_OP_VOP2__V_ADD_F16, - &Decoder::decode_OP_VOP2__V_SUB_F16, - &Decoder::decode_OP_VOP2__V_SUB_F16, - &Decoder::decode_OP_VOP2__V_SUB_F16, - &Decoder::decode_OP_VOP2__V_SUB_F16, - &Decoder::decode_OP_VOP2__V_SUBREV_F16, - &Decoder::decode_OP_VOP2__V_SUBREV_F16, - &Decoder::decode_OP_VOP2__V_SUBREV_F16, - &Decoder::decode_OP_VOP2__V_SUBREV_F16, - &Decoder::decode_OP_VOP2__V_MUL_F16, - &Decoder::decode_OP_VOP2__V_MUL_F16, - &Decoder::decode_OP_VOP2__V_MUL_F16, - &Decoder::decode_OP_VOP2__V_MUL_F16, - &Decoder::decode_OP_VOP2__V_MAC_F16, - &Decoder::decode_OP_VOP2__V_MAC_F16, - &Decoder::decode_OP_VOP2__V_MAC_F16, - &Decoder::decode_OP_VOP2__V_MAC_F16, - &Decoder::decode_OP_VOP2__V_MADMK_F16, - &Decoder::decode_OP_VOP2__V_MADMK_F16, - &Decoder::decode_OP_VOP2__V_MADMK_F16, - &Decoder::decode_OP_VOP2__V_MADMK_F16, - &Decoder::decode_OP_VOP2__V_MADAK_F16, - &Decoder::decode_OP_VOP2__V_MADAK_F16, - &Decoder::decode_OP_VOP2__V_MADAK_F16, - &Decoder::decode_OP_VOP2__V_MADAK_F16, - &Decoder::decode_OP_VOP2__V_ADD_U16, - &Decoder::decode_OP_VOP2__V_ADD_U16, - &Decoder::decode_OP_VOP2__V_ADD_U16, - &Decoder::decode_OP_VOP2__V_ADD_U16, - &Decoder::decode_OP_VOP2__V_SUB_U16, - &Decoder::decode_OP_VOP2__V_SUB_U16, - &Decoder::decode_OP_VOP2__V_SUB_U16, - &Decoder::decode_OP_VOP2__V_SUB_U16, - &Decoder::decode_OP_VOP2__V_SUBREV_U16, - &Decoder::decode_OP_VOP2__V_SUBREV_U16, - &Decoder::decode_OP_VOP2__V_SUBREV_U16, - &Decoder::decode_OP_VOP2__V_SUBREV_U16, - &Decoder::decode_OP_VOP2__V_MUL_LO_U16, - &Decoder::decode_OP_VOP2__V_MUL_LO_U16, - 
&Decoder::decode_OP_VOP2__V_MUL_LO_U16, - &Decoder::decode_OP_VOP2__V_MUL_LO_U16, - &Decoder::decode_OP_VOP2__V_LSHLREV_B16, - &Decoder::decode_OP_VOP2__V_LSHLREV_B16, - &Decoder::decode_OP_VOP2__V_LSHLREV_B16, - &Decoder::decode_OP_VOP2__V_LSHLREV_B16, - &Decoder::decode_OP_VOP2__V_LSHRREV_B16, - &Decoder::decode_OP_VOP2__V_LSHRREV_B16, - &Decoder::decode_OP_VOP2__V_LSHRREV_B16, - &Decoder::decode_OP_VOP2__V_LSHRREV_B16, - &Decoder::decode_OP_VOP2__V_ASHRREV_I16, - &Decoder::decode_OP_VOP2__V_ASHRREV_I16, - &Decoder::decode_OP_VOP2__V_ASHRREV_I16, - &Decoder::decode_OP_VOP2__V_ASHRREV_I16, - &Decoder::decode_OP_VOP2__V_MAX_F16, - &Decoder::decode_OP_VOP2__V_MAX_F16, - &Decoder::decode_OP_VOP2__V_MAX_F16, - &Decoder::decode_OP_VOP2__V_MAX_F16, - &Decoder::decode_OP_VOP2__V_MIN_F16, - &Decoder::decode_OP_VOP2__V_MIN_F16, - &Decoder::decode_OP_VOP2__V_MIN_F16, - &Decoder::decode_OP_VOP2__V_MIN_F16, - &Decoder::decode_OP_VOP2__V_MAX_U16, - &Decoder::decode_OP_VOP2__V_MAX_U16, - &Decoder::decode_OP_VOP2__V_MAX_U16, - &Decoder::decode_OP_VOP2__V_MAX_U16, - &Decoder::decode_OP_VOP2__V_MAX_I16, - &Decoder::decode_OP_VOP2__V_MAX_I16, - &Decoder::decode_OP_VOP2__V_MAX_I16, - &Decoder::decode_OP_VOP2__V_MAX_I16, - &Decoder::decode_OP_VOP2__V_MIN_U16, - &Decoder::decode_OP_VOP2__V_MIN_U16, - &Decoder::decode_OP_VOP2__V_MIN_U16, - &Decoder::decode_OP_VOP2__V_MIN_U16, - &Decoder::decode_OP_VOP2__V_MIN_I16, - &Decoder::decode_OP_VOP2__V_MIN_I16, - &Decoder::decode_OP_VOP2__V_MIN_I16, - &Decoder::decode_OP_VOP2__V_MIN_I16, - &Decoder::decode_OP_VOP2__V_LDEXP_F16, - &Decoder::decode_OP_VOP2__V_LDEXP_F16, - &Decoder::decode_OP_VOP2__V_LDEXP_F16, - &Decoder::decode_OP_VOP2__V_LDEXP_F16, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::subDecode_OP_VOPC, - &Decoder::subDecode_OP_VOPC, - &Decoder::subDecode_OP_VOPC, - &Decoder::subDecode_OP_VOPC, - &Decoder::subDecode_OP_VOP1, - &Decoder::subDecode_OP_VOP1, - &Decoder::subDecode_OP_VOP1, - &Decoder::subDecode_OP_VOP1, - &Decoder::decode_OP_SOP2__S_ADD_U32, - &Decoder::decode_OP_SOP2__S_SUB_U32, - &Decoder::decode_OP_SOP2__S_ADD_I32, - &Decoder::decode_OP_SOP2__S_SUB_I32, - &Decoder::decode_OP_SOP2__S_ADDC_U32, - &Decoder::decode_OP_SOP2__S_SUBB_U32, - &Decoder::decode_OP_SOP2__S_MIN_I32, - &Decoder::decode_OP_SOP2__S_MIN_U32, - &Decoder::decode_OP_SOP2__S_MAX_I32, - &Decoder::decode_OP_SOP2__S_MAX_U32, - &Decoder::decode_OP_SOP2__S_CSELECT_B32, - &Decoder::decode_OP_SOP2__S_CSELECT_B64, - &Decoder::decode_OP_SOP2__S_AND_B32, - &Decoder::decode_OP_SOP2__S_AND_B64, - &Decoder::decode_OP_SOP2__S_OR_B32, - &Decoder::decode_OP_SOP2__S_OR_B64, - &Decoder::decode_OP_SOP2__S_XOR_B32, - &Decoder::decode_OP_SOP2__S_XOR_B64, - &Decoder::decode_OP_SOP2__S_ANDN2_B32, - &Decoder::decode_OP_SOP2__S_ANDN2_B64, - &Decoder::decode_OP_SOP2__S_ORN2_B32, - &Decoder::decode_OP_SOP2__S_ORN2_B64, - &Decoder::decode_OP_SOP2__S_NAND_B32, - 
&Decoder::decode_OP_SOP2__S_NAND_B64, - &Decoder::decode_OP_SOP2__S_NOR_B32, - &Decoder::decode_OP_SOP2__S_NOR_B64, - &Decoder::decode_OP_SOP2__S_XNOR_B32, - &Decoder::decode_OP_SOP2__S_XNOR_B64, - &Decoder::decode_OP_SOP2__S_LSHL_B32, - &Decoder::decode_OP_SOP2__S_LSHL_B64, - &Decoder::decode_OP_SOP2__S_LSHR_B32, - &Decoder::decode_OP_SOP2__S_LSHR_B64, - &Decoder::decode_OP_SOP2__S_ASHR_I32, - &Decoder::decode_OP_SOP2__S_ASHR_I64, - &Decoder::decode_OP_SOP2__S_BFM_B32, - &Decoder::decode_OP_SOP2__S_BFM_B64, - &Decoder::decode_OP_SOP2__S_MUL_I32, - &Decoder::decode_OP_SOP2__S_BFE_U32, - &Decoder::decode_OP_SOP2__S_BFE_I32, - &Decoder::decode_OP_SOP2__S_BFE_U64, - &Decoder::decode_OP_SOP2__S_BFE_I64, - &Decoder::decode_OP_SOP2__S_CBRANCH_G_FORK, - &Decoder::decode_OP_SOP2__S_ABSDIFF_I32, - &Decoder::decode_OP_SOP2__S_RFE_RESTORE_B64, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_SOPK__S_MOVK_I32, - &Decoder::decode_OP_SOPK__S_CMOVK_I32, - &Decoder::decode_OP_SOPK__S_CMPK_EQ_I32, - &Decoder::decode_OP_SOPK__S_CMPK_LG_I32, - &Decoder::decode_OP_SOPK__S_CMPK_GT_I32, - &Decoder::decode_OP_SOPK__S_CMPK_GE_I32, - &Decoder::decode_OP_SOPK__S_CMPK_LT_I32, - &Decoder::decode_OP_SOPK__S_CMPK_LE_I32, - &Decoder::decode_OP_SOPK__S_CMPK_EQ_U32, - &Decoder::decode_OP_SOPK__S_CMPK_LG_U32, - &Decoder::decode_OP_SOPK__S_CMPK_GT_U32, - &Decoder::decode_OP_SOPK__S_CMPK_GE_U32, - &Decoder::decode_OP_SOPK__S_CMPK_LT_U32, - &Decoder::decode_OP_SOPK__S_CMPK_LE_U32, - &Decoder::decode_OP_SOPK__S_ADDK_I32, - &Decoder::decode_OP_SOPK__S_MULK_I32, - &Decoder::decode_OP_SOPK__S_CBRANCH_I_FORK, - &Decoder::decode_OP_SOPK__S_GETREG_B32, - &Decoder::decode_OP_SOPK__S_SETREG_B32, - &Decoder::decode_invalid, - &Decoder::decode_OP_SOPK__S_SETREG_IMM32_B32, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::subDecode_OP_SOP1, - &Decoder::subDecode_OP_SOPC, - &Decoder::subDecode_OP_SOPP, - &Decoder::subDecode_OP_SMEM, - &Decoder::subDecode_OP_SMEM, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_EXP, - &Decoder::decode_OP_EXP, - &Decoder::decode_OP_EXP, - &Decoder::decode_OP_EXP, - &Decoder::decode_OP_EXP, - &Decoder::decode_OP_EXP, - &Decoder::decode_OP_EXP, - &Decoder::decode_OP_EXP, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::subDecode_OPU_VOP3, - &Decoder::subDecode_OPU_VOP3, - &Decoder::subDecode_OPU_VOP3, - &Decoder::subDecode_OPU_VOP3, - &Decoder::subDecode_OPU_VOP3, - &Decoder::subDecode_OPU_VOP3, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::subDecode_OP_VINTRP, - &Decoder::subDecode_OP_VINTRP, - &Decoder::subDecode_OP_VINTRP, - &Decoder::subDecode_OP_VINTRP, - &Decoder::subDecode_OP_VINTRP, - &Decoder::subDecode_OP_VINTRP, - &Decoder::subDecode_OP_VINTRP, - &Decoder::subDecode_OP_VINTRP, - &Decoder::subDecode_OP_DS, - &Decoder::subDecode_OP_DS, - &Decoder::subDecode_OP_DS, - &Decoder::subDecode_OP_DS, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::subDecode_OP_FLAT, - &Decoder::decode_invalid, - &Decoder::subDecode_OP_FLAT, - &Decoder::subDecode_OP_FLAT, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::subDecode_OP_MUBUF, - &Decoder::subDecode_OP_MUBUF, - &Decoder::subDecode_OP_MUBUF, - &Decoder::subDecode_OP_MUBUF, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::subDecode_OP_MTBUF, - &Decoder::subDecode_OP_MTBUF, - &Decoder::subDecode_OP_MTBUF, - &Decoder::subDecode_OP_MTBUF, - 
&Decoder::subDecode_OP_MTBUF, - &Decoder::subDecode_OP_MTBUF, - &Decoder::subDecode_OP_MTBUF, - &Decoder::subDecode_OP_MTBUF, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::subDecode_OP_MIMG, - &Decoder::subDecode_OP_MIMG, - &Decoder::subDecode_OP_MIMG, - &Decoder::subDecode_OP_MIMG, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OPU_VOP3[] = { - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_CMP_CLASS_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_CLASS_F32, - &Decoder::decode_OPU_VOP3__V_CMP_CLASS_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_CLASS_F64, - 
&Decoder::decode_OPU_VOP3__V_CMP_CLASS_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_CLASS_F16, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_CMP_F_F16, - &Decoder::decode_OPU_VOP3__V_CMP_LT_F16, - &Decoder::decode_OPU_VOP3__V_CMP_EQ_F16, - &Decoder::decode_OPU_VOP3__V_CMP_LE_F16, - &Decoder::decode_OPU_VOP3__V_CMP_GT_F16, - &Decoder::decode_OPU_VOP3__V_CMP_LG_F16, - &Decoder::decode_OPU_VOP3__V_CMP_GE_F16, - &Decoder::decode_OPU_VOP3__V_CMP_O_F16, - &Decoder::decode_OPU_VOP3__V_CMP_U_F16, - &Decoder::decode_OPU_VOP3__V_CMP_NGE_F16, - &Decoder::decode_OPU_VOP3__V_CMP_NLG_F16, - &Decoder::decode_OPU_VOP3__V_CMP_NGT_F16, - &Decoder::decode_OPU_VOP3__V_CMP_NLE_F16, - &Decoder::decode_OPU_VOP3__V_CMP_NEQ_F16, - &Decoder::decode_OPU_VOP3__V_CMP_NLT_F16, - &Decoder::decode_OPU_VOP3__V_CMP_TRU_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_F_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_LT_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_EQ_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_LE_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_GT_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_LG_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_GE_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_O_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_U_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_NGE_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_NLG_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_NGT_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_NLE_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_NEQ_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_NLT_F16, - &Decoder::decode_OPU_VOP3__V_CMPX_TRU_F16, - &Decoder::decode_OPU_VOP3__V_CMP_F_F32, - &Decoder::decode_OPU_VOP3__V_CMP_LT_F32, - &Decoder::decode_OPU_VOP3__V_CMP_EQ_F32, - &Decoder::decode_OPU_VOP3__V_CMP_LE_F32, - &Decoder::decode_OPU_VOP3__V_CMP_GT_F32, - 
&Decoder::decode_OPU_VOP3__V_CMP_LG_F32, - &Decoder::decode_OPU_VOP3__V_CMP_GE_F32, - &Decoder::decode_OPU_VOP3__V_CMP_O_F32, - &Decoder::decode_OPU_VOP3__V_CMP_U_F32, - &Decoder::decode_OPU_VOP3__V_CMP_NGE_F32, - &Decoder::decode_OPU_VOP3__V_CMP_NLG_F32, - &Decoder::decode_OPU_VOP3__V_CMP_NGT_F32, - &Decoder::decode_OPU_VOP3__V_CMP_NLE_F32, - &Decoder::decode_OPU_VOP3__V_CMP_NEQ_F32, - &Decoder::decode_OPU_VOP3__V_CMP_NLT_F32, - &Decoder::decode_OPU_VOP3__V_CMP_TRU_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_F_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_LT_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_EQ_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_LE_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_GT_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_LG_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_GE_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_O_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_U_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_NGE_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_NLG_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_NGT_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_NLE_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_NEQ_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_NLT_F32, - &Decoder::decode_OPU_VOP3__V_CMPX_TRU_F32, - &Decoder::decode_OPU_VOP3__V_CMP_F_F64, - &Decoder::decode_OPU_VOP3__V_CMP_LT_F64, - &Decoder::decode_OPU_VOP3__V_CMP_EQ_F64, - &Decoder::decode_OPU_VOP3__V_CMP_LE_F64, - &Decoder::decode_OPU_VOP3__V_CMP_GT_F64, - &Decoder::decode_OPU_VOP3__V_CMP_LG_F64, - &Decoder::decode_OPU_VOP3__V_CMP_GE_F64, - &Decoder::decode_OPU_VOP3__V_CMP_O_F64, - &Decoder::decode_OPU_VOP3__V_CMP_U_F64, - &Decoder::decode_OPU_VOP3__V_CMP_NGE_F64, - &Decoder::decode_OPU_VOP3__V_CMP_NLG_F64, - &Decoder::decode_OPU_VOP3__V_CMP_NGT_F64, - &Decoder::decode_OPU_VOP3__V_CMP_NLE_F64, - &Decoder::decode_OPU_VOP3__V_CMP_NEQ_F64, - &Decoder::decode_OPU_VOP3__V_CMP_NLT_F64, - &Decoder::decode_OPU_VOP3__V_CMP_TRU_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_F_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_LT_F64, - 
&Decoder::decode_OPU_VOP3__V_CMPX_EQ_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_LE_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_GT_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_LG_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_GE_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_O_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_U_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_NGE_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_NLG_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_NGT_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_NLE_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_NEQ_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_NLT_F64, - &Decoder::decode_OPU_VOP3__V_CMPX_TRU_F64, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_CMP_F_I16, - &Decoder::decode_OPU_VOP3__V_CMP_LT_I16, - &Decoder::decode_OPU_VOP3__V_CMP_EQ_I16, - &Decoder::decode_OPU_VOP3__V_CMP_LE_I16, - &Decoder::decode_OPU_VOP3__V_CMP_GT_I16, - &Decoder::decode_OPU_VOP3__V_CMP_NE_I16, - &Decoder::decode_OPU_VOP3__V_CMP_GE_I16, - &Decoder::decode_OPU_VOP3__V_CMP_T_I16, - &Decoder::decode_OPU_VOP3__V_CMP_F_U16, - &Decoder::decode_OPU_VOP3__V_CMP_LT_U16, - &Decoder::decode_OPU_VOP3__V_CMP_EQ_U16, - 
&Decoder::decode_OPU_VOP3__V_CMP_LE_U16, - &Decoder::decode_OPU_VOP3__V_CMP_GT_U16, - &Decoder::decode_OPU_VOP3__V_CMP_NE_U16, - &Decoder::decode_OPU_VOP3__V_CMP_GE_U16, - &Decoder::decode_OPU_VOP3__V_CMP_T_U16, - &Decoder::decode_OPU_VOP3__V_CMPX_F_I16, - &Decoder::decode_OPU_VOP3__V_CMPX_LT_I16, - &Decoder::decode_OPU_VOP3__V_CMPX_EQ_I16, - &Decoder::decode_OPU_VOP3__V_CMPX_LE_I16, - &Decoder::decode_OPU_VOP3__V_CMPX_GT_I16, - &Decoder::decode_OPU_VOP3__V_CMPX_NE_I16, - &Decoder::decode_OPU_VOP3__V_CMPX_GE_I16, - &Decoder::decode_OPU_VOP3__V_CMPX_T_I16, - &Decoder::decode_OPU_VOP3__V_CMPX_F_U16, - &Decoder::decode_OPU_VOP3__V_CMPX_LT_U16, - &Decoder::decode_OPU_VOP3__V_CMPX_EQ_U16, - &Decoder::decode_OPU_VOP3__V_CMPX_LE_U16, - &Decoder::decode_OPU_VOP3__V_CMPX_GT_U16, - &Decoder::decode_OPU_VOP3__V_CMPX_NE_U16, - &Decoder::decode_OPU_VOP3__V_CMPX_GE_U16, - &Decoder::decode_OPU_VOP3__V_CMPX_T_U16, - &Decoder::decode_OPU_VOP3__V_CMP_F_I32, - &Decoder::decode_OPU_VOP3__V_CMP_LT_I32, - &Decoder::decode_OPU_VOP3__V_CMP_EQ_I32, - &Decoder::decode_OPU_VOP3__V_CMP_LE_I32, - &Decoder::decode_OPU_VOP3__V_CMP_GT_I32, - &Decoder::decode_OPU_VOP3__V_CMP_NE_I32, - &Decoder::decode_OPU_VOP3__V_CMP_GE_I32, - &Decoder::decode_OPU_VOP3__V_CMP_T_I32, - &Decoder::decode_OPU_VOP3__V_CMP_F_U32, - &Decoder::decode_OPU_VOP3__V_CMP_LT_U32, - &Decoder::decode_OPU_VOP3__V_CMP_EQ_U32, - &Decoder::decode_OPU_VOP3__V_CMP_LE_U32, - &Decoder::decode_OPU_VOP3__V_CMP_GT_U32, - &Decoder::decode_OPU_VOP3__V_CMP_NE_U32, - &Decoder::decode_OPU_VOP3__V_CMP_GE_U32, - &Decoder::decode_OPU_VOP3__V_CMP_T_U32, - &Decoder::decode_OPU_VOP3__V_CMPX_F_I32, - &Decoder::decode_OPU_VOP3__V_CMPX_LT_I32, - &Decoder::decode_OPU_VOP3__V_CMPX_EQ_I32, - &Decoder::decode_OPU_VOP3__V_CMPX_LE_I32, - &Decoder::decode_OPU_VOP3__V_CMPX_GT_I32, - &Decoder::decode_OPU_VOP3__V_CMPX_NE_I32, - &Decoder::decode_OPU_VOP3__V_CMPX_GE_I32, - &Decoder::decode_OPU_VOP3__V_CMPX_T_I32, - &Decoder::decode_OPU_VOP3__V_CMPX_F_U32, - 
&Decoder::decode_OPU_VOP3__V_CMPX_LT_U32, - &Decoder::decode_OPU_VOP3__V_CMPX_EQ_U32, - &Decoder::decode_OPU_VOP3__V_CMPX_LE_U32, - &Decoder::decode_OPU_VOP3__V_CMPX_GT_U32, - &Decoder::decode_OPU_VOP3__V_CMPX_NE_U32, - &Decoder::decode_OPU_VOP3__V_CMPX_GE_U32, - &Decoder::decode_OPU_VOP3__V_CMPX_T_U32, - &Decoder::decode_OPU_VOP3__V_CMP_F_I64, - &Decoder::decode_OPU_VOP3__V_CMP_LT_I64, - &Decoder::decode_OPU_VOP3__V_CMP_EQ_I64, - &Decoder::decode_OPU_VOP3__V_CMP_LE_I64, - &Decoder::decode_OPU_VOP3__V_CMP_GT_I64, - &Decoder::decode_OPU_VOP3__V_CMP_NE_I64, - &Decoder::decode_OPU_VOP3__V_CMP_GE_I64, - &Decoder::decode_OPU_VOP3__V_CMP_T_I64, - &Decoder::decode_OPU_VOP3__V_CMP_F_U64, - &Decoder::decode_OPU_VOP3__V_CMP_LT_U64, - &Decoder::decode_OPU_VOP3__V_CMP_EQ_U64, - &Decoder::decode_OPU_VOP3__V_CMP_LE_U64, - &Decoder::decode_OPU_VOP3__V_CMP_GT_U64, - &Decoder::decode_OPU_VOP3__V_CMP_NE_U64, - &Decoder::decode_OPU_VOP3__V_CMP_GE_U64, - &Decoder::decode_OPU_VOP3__V_CMP_T_U64, - &Decoder::decode_OPU_VOP3__V_CMPX_F_I64, - &Decoder::decode_OPU_VOP3__V_CMPX_LT_I64, - &Decoder::decode_OPU_VOP3__V_CMPX_EQ_I64, - &Decoder::decode_OPU_VOP3__V_CMPX_LE_I64, - &Decoder::decode_OPU_VOP3__V_CMPX_GT_I64, - &Decoder::decode_OPU_VOP3__V_CMPX_NE_I64, - &Decoder::decode_OPU_VOP3__V_CMPX_GE_I64, - &Decoder::decode_OPU_VOP3__V_CMPX_T_I64, - &Decoder::decode_OPU_VOP3__V_CMPX_F_U64, - &Decoder::decode_OPU_VOP3__V_CMPX_LT_U64, - &Decoder::decode_OPU_VOP3__V_CMPX_EQ_U64, - &Decoder::decode_OPU_VOP3__V_CMPX_LE_U64, - &Decoder::decode_OPU_VOP3__V_CMPX_GT_U64, - &Decoder::decode_OPU_VOP3__V_CMPX_NE_U64, - &Decoder::decode_OPU_VOP3__V_CMPX_GE_U64, - &Decoder::decode_OPU_VOP3__V_CMPX_T_U64, - &Decoder::decode_OPU_VOP3__V_CNDMASK_B32, - &Decoder::decode_OPU_VOP3__V_ADD_F32, - &Decoder::decode_OPU_VOP3__V_SUB_F32, - &Decoder::decode_OPU_VOP3__V_SUBREV_F32, - &Decoder::decode_OPU_VOP3__V_MUL_LEGACY_F32, - &Decoder::decode_OPU_VOP3__V_MUL_F32, - &Decoder::decode_OPU_VOP3__V_MUL_I32_I24, - 
&Decoder::decode_OPU_VOP3__V_MUL_HI_I32_I24, - &Decoder::decode_OPU_VOP3__V_MUL_U32_U24, - &Decoder::decode_OPU_VOP3__V_MUL_HI_U32_U24, - &Decoder::decode_OPU_VOP3__V_MIN_F32, - &Decoder::decode_OPU_VOP3__V_MAX_F32, - &Decoder::decode_OPU_VOP3__V_MIN_I32, - &Decoder::decode_OPU_VOP3__V_MAX_I32, - &Decoder::decode_OPU_VOP3__V_MIN_U32, - &Decoder::decode_OPU_VOP3__V_MAX_U32, - &Decoder::decode_OPU_VOP3__V_LSHRREV_B32, - &Decoder::decode_OPU_VOP3__V_ASHRREV_I32, - &Decoder::decode_OPU_VOP3__V_LSHLREV_B32, - &Decoder::decode_OPU_VOP3__V_AND_B32, - &Decoder::decode_OPU_VOP3__V_OR_B32, - &Decoder::decode_OPU_VOP3__V_XOR_B32, - &Decoder::decode_OPU_VOP3__V_MAC_F32, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_ADD_U32, - &Decoder::decode_OPU_VOP3__V_SUB_U32, - &Decoder::decode_OPU_VOP3__V_SUBREV_U32, - &Decoder::decode_OPU_VOP3__V_ADDC_U32, - &Decoder::decode_OPU_VOP3__V_SUBB_U32, - &Decoder::decode_OPU_VOP3__V_SUBBREV_U32, - &Decoder::decode_OPU_VOP3__V_ADD_F16, - &Decoder::decode_OPU_VOP3__V_SUB_F16, - &Decoder::decode_OPU_VOP3__V_SUBREV_F16, - &Decoder::decode_OPU_VOP3__V_MUL_F16, - &Decoder::decode_OPU_VOP3__V_MAC_F16, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_ADD_U16, - &Decoder::decode_OPU_VOP3__V_SUB_U16, - &Decoder::decode_OPU_VOP3__V_SUBREV_U16, - &Decoder::decode_OPU_VOP3__V_MUL_LO_U16, - &Decoder::decode_OPU_VOP3__V_LSHLREV_B16, - &Decoder::decode_OPU_VOP3__V_LSHRREV_B16, - &Decoder::decode_OPU_VOP3__V_ASHRREV_I16, - &Decoder::decode_OPU_VOP3__V_MAX_F16, - &Decoder::decode_OPU_VOP3__V_MIN_F16, - &Decoder::decode_OPU_VOP3__V_MAX_U16, - &Decoder::decode_OPU_VOP3__V_MAX_I16, - &Decoder::decode_OPU_VOP3__V_MIN_U16, - &Decoder::decode_OPU_VOP3__V_MIN_I16, - &Decoder::decode_OPU_VOP3__V_LDEXP_F16, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_NOP, - &Decoder::decode_OPU_VOP3__V_MOV_B32, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_CVT_I32_F64, - &Decoder::decode_OPU_VOP3__V_CVT_F64_I32, - &Decoder::decode_OPU_VOP3__V_CVT_F32_I32, - &Decoder::decode_OPU_VOP3__V_CVT_F32_U32, - &Decoder::decode_OPU_VOP3__V_CVT_U32_F32, - &Decoder::decode_OPU_VOP3__V_CVT_I32_F32, - &Decoder::decode_OPU_VOP3__V_MOV_FED_B32, - &Decoder::decode_OPU_VOP3__V_CVT_F16_F32, - &Decoder::decode_OPU_VOP3__V_CVT_F32_F16, - &Decoder::decode_OPU_VOP3__V_CVT_RPI_I32_F32, - &Decoder::decode_OPU_VOP3__V_CVT_FLR_I32_F32, - &Decoder::decode_OPU_VOP3__V_CVT_OFF_F32_I4, - &Decoder::decode_OPU_VOP3__V_CVT_F32_F64, - &Decoder::decode_OPU_VOP3__V_CVT_F64_F32, - &Decoder::decode_OPU_VOP3__V_CVT_F32_UBYTE0, - &Decoder::decode_OPU_VOP3__V_CVT_F32_UBYTE1, - &Decoder::decode_OPU_VOP3__V_CVT_F32_UBYTE2, - &Decoder::decode_OPU_VOP3__V_CVT_F32_UBYTE3, - &Decoder::decode_OPU_VOP3__V_CVT_U32_F64, - &Decoder::decode_OPU_VOP3__V_CVT_F64_U32, - &Decoder::decode_OPU_VOP3__V_TRUNC_F64, - &Decoder::decode_OPU_VOP3__V_CEIL_F64, - &Decoder::decode_OPU_VOP3__V_RNDNE_F64, - &Decoder::decode_OPU_VOP3__V_FLOOR_F64, - &Decoder::decode_OPU_VOP3__V_FRACT_F32, - &Decoder::decode_OPU_VOP3__V_TRUNC_F32, - &Decoder::decode_OPU_VOP3__V_CEIL_F32, - &Decoder::decode_OPU_VOP3__V_RNDNE_F32, - &Decoder::decode_OPU_VOP3__V_FLOOR_F32, - &Decoder::decode_OPU_VOP3__V_EXP_F32, - &Decoder::decode_OPU_VOP3__V_LOG_F32, - &Decoder::decode_OPU_VOP3__V_RCP_F32, - &Decoder::decode_OPU_VOP3__V_RCP_IFLAG_F32, - &Decoder::decode_OPU_VOP3__V_RSQ_F32, - &Decoder::decode_OPU_VOP3__V_RCP_F64, - &Decoder::decode_OPU_VOP3__V_RSQ_F64, - &Decoder::decode_OPU_VOP3__V_SQRT_F32, - &Decoder::decode_OPU_VOP3__V_SQRT_F64, - &Decoder::decode_OPU_VOP3__V_SIN_F32, - &Decoder::decode_OPU_VOP3__V_COS_F32, - 
&Decoder::decode_OPU_VOP3__V_NOT_B32, - &Decoder::decode_OPU_VOP3__V_BFREV_B32, - &Decoder::decode_OPU_VOP3__V_FFBH_U32, - &Decoder::decode_OPU_VOP3__V_FFBL_B32, - &Decoder::decode_OPU_VOP3__V_FFBH_I32, - &Decoder::decode_OPU_VOP3__V_FREXP_EXP_I32_F64, - &Decoder::decode_OPU_VOP3__V_FREXP_MANT_F64, - &Decoder::decode_OPU_VOP3__V_FRACT_F64, - &Decoder::decode_OPU_VOP3__V_FREXP_EXP_I32_F32, - &Decoder::decode_OPU_VOP3__V_FREXP_MANT_F32, - &Decoder::decode_OPU_VOP3__V_CLREXCP, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_CVT_F16_U16, - &Decoder::decode_OPU_VOP3__V_CVT_F16_I16, - &Decoder::decode_OPU_VOP3__V_CVT_U16_F16, - &Decoder::decode_OPU_VOP3__V_CVT_I16_F16, - &Decoder::decode_OPU_VOP3__V_RCP_F16, - &Decoder::decode_OPU_VOP3__V_SQRT_F16, - &Decoder::decode_OPU_VOP3__V_RSQ_F16, - &Decoder::decode_OPU_VOP3__V_LOG_F16, - &Decoder::decode_OPU_VOP3__V_EXP_F16, - &Decoder::decode_OPU_VOP3__V_FREXP_MANT_F16, - &Decoder::decode_OPU_VOP3__V_FREXP_EXP_I16_F16, - &Decoder::decode_OPU_VOP3__V_FLOOR_F16, - &Decoder::decode_OPU_VOP3__V_CEIL_F16, - &Decoder::decode_OPU_VOP3__V_TRUNC_F16, - &Decoder::decode_OPU_VOP3__V_RNDNE_F16, - &Decoder::decode_OPU_VOP3__V_FRACT_F16, - &Decoder::decode_OPU_VOP3__V_SIN_F16, - &Decoder::decode_OPU_VOP3__V_COS_F16, - &Decoder::decode_OPU_VOP3__V_EXP_LEGACY_F32, - &Decoder::decode_OPU_VOP3__V_LOG_LEGACY_F32, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_MAD_LEGACY_F32, - &Decoder::decode_OPU_VOP3__V_MAD_F32, - &Decoder::decode_OPU_VOP3__V_MAD_I32_I24, - &Decoder::decode_OPU_VOP3__V_MAD_U32_U24, - &Decoder::decode_OPU_VOP3__V_CUBEID_F32, - &Decoder::decode_OPU_VOP3__V_CUBESC_F32, - &Decoder::decode_OPU_VOP3__V_CUBETC_F32, - &Decoder::decode_OPU_VOP3__V_CUBEMA_F32, - &Decoder::decode_OPU_VOP3__V_BFE_U32, - &Decoder::decode_OPU_VOP3__V_BFE_I32, - &Decoder::decode_OPU_VOP3__V_BFI_B32, - &Decoder::decode_OPU_VOP3__V_FMA_F32, - &Decoder::decode_OPU_VOP3__V_FMA_F64, - &Decoder::decode_OPU_VOP3__V_LERP_U8, - &Decoder::decode_OPU_VOP3__V_ALIGNBIT_B32, - &Decoder::decode_OPU_VOP3__V_ALIGNBYTE_B32, - &Decoder::decode_OPU_VOP3__V_MIN3_F32, - &Decoder::decode_OPU_VOP3__V_MIN3_I32, - &Decoder::decode_OPU_VOP3__V_MIN3_U32, - &Decoder::decode_OPU_VOP3__V_MAX3_F32, - &Decoder::decode_OPU_VOP3__V_MAX3_I32, - &Decoder::decode_OPU_VOP3__V_MAX3_U32, - &Decoder::decode_OPU_VOP3__V_MED3_F32, - &Decoder::decode_OPU_VOP3__V_MED3_I32, - &Decoder::decode_OPU_VOP3__V_MED3_U32, - &Decoder::decode_OPU_VOP3__V_SAD_U8, - &Decoder::decode_OPU_VOP3__V_SAD_HI_U8, - 
&Decoder::decode_OPU_VOP3__V_SAD_U16, - &Decoder::decode_OPU_VOP3__V_SAD_U32, - &Decoder::decode_OPU_VOP3__V_CVT_PK_U8_F32, - &Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F32, - &Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F64, - &Decoder::decode_OPU_VOP3__V_DIV_SCALE_F32, - &Decoder::decode_OPU_VOP3__V_DIV_SCALE_F64, - &Decoder::decode_OPU_VOP3__V_DIV_FMAS_F32, - &Decoder::decode_OPU_VOP3__V_DIV_FMAS_F64, - &Decoder::decode_OPU_VOP3__V_MSAD_U8, - &Decoder::decode_OPU_VOP3__V_QSAD_PK_U16_U8, - &Decoder::decode_OPU_VOP3__V_MQSAD_PK_U16_U8, - &Decoder::decode_OPU_VOP3__V_MQSAD_U32_U8, - &Decoder::decode_OPU_VOP3__V_MAD_U64_U32, - &Decoder::decode_OPU_VOP3__V_MAD_I64_I32, - &Decoder::decode_OPU_VOP3__V_MAD_F16, - &Decoder::decode_OPU_VOP3__V_MAD_U16, - &Decoder::decode_OPU_VOP3__V_MAD_I16, - &Decoder::decode_OPU_VOP3__V_PERM_B32, - &Decoder::decode_OPU_VOP3__V_FMA_F16, - &Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16, - &Decoder::decode_OPU_VOP3__V_CVT_PKACCUM_U8_F32, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, 
- &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_INTERP_P1_F32, - &Decoder::decode_OPU_VOP3__V_INTERP_P2_F32, - &Decoder::decode_OPU_VOP3__V_INTERP_MOV_F32, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_INTERP_P1LL_F16, - &Decoder::decode_OPU_VOP3__V_INTERP_P1LV_F16, - &Decoder::decode_OPU_VOP3__V_INTERP_P2_F16, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_ADD_F64, - &Decoder::decode_OPU_VOP3__V_MUL_F64, - &Decoder::decode_OPU_VOP3__V_MIN_F64, - &Decoder::decode_OPU_VOP3__V_MAX_F64, - &Decoder::decode_OPU_VOP3__V_LDEXP_F64, - &Decoder::decode_OPU_VOP3__V_MUL_LO_U32, - &Decoder::decode_OPU_VOP3__V_MUL_HI_U32, - &Decoder::decode_OPU_VOP3__V_MUL_HI_I32, - &Decoder::decode_OPU_VOP3__V_LDEXP_F32, - &Decoder::decode_OPU_VOP3__V_READLANE_B32, - &Decoder::decode_OPU_VOP3__V_WRITELANE_B32, - &Decoder::decode_OPU_VOP3__V_BCNT_U32_B32, - &Decoder::decode_OPU_VOP3__V_MBCNT_LO_U32_B32, - &Decoder::decode_OPU_VOP3__V_MBCNT_HI_U32_B32, - &Decoder::decode_invalid, - &Decoder::decode_OPU_VOP3__V_LSHLREV_B64, - &Decoder::decode_OPU_VOP3__V_LSHRREV_B64, - &Decoder::decode_OPU_VOP3__V_ASHRREV_I64, - &Decoder::decode_OPU_VOP3__V_TRIG_PREOP_F64, - &Decoder::decode_OPU_VOP3__V_BFM_B32, - &Decoder::decode_OPU_VOP3__V_CVT_PKNORM_I16_F32, - 
&Decoder::decode_OPU_VOP3__V_CVT_PKNORM_U16_F32, - &Decoder::decode_OPU_VOP3__V_CVT_PKRTZ_F16_F32, - &Decoder::decode_OPU_VOP3__V_CVT_PK_U16_U32, - &Decoder::decode_OPU_VOP3__V_CVT_PK_I16_I32, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_DS[] = { - &Decoder::decode_OP_DS__DS_ADD_U32, - &Decoder::decode_OP_DS__DS_SUB_U32, - &Decoder::decode_OP_DS__DS_RSUB_U32, - &Decoder::decode_OP_DS__DS_INC_U32, - &Decoder::decode_OP_DS__DS_DEC_U32, - &Decoder::decode_OP_DS__DS_MIN_I32, - &Decoder::decode_OP_DS__DS_MAX_I32, - &Decoder::decode_OP_DS__DS_MIN_U32, - &Decoder::decode_OP_DS__DS_MAX_U32, - &Decoder::decode_OP_DS__DS_AND_B32, - &Decoder::decode_OP_DS__DS_OR_B32, - &Decoder::decode_OP_DS__DS_XOR_B32, - &Decoder::decode_OP_DS__DS_MSKOR_B32, - &Decoder::decode_OP_DS__DS_WRITE_B32, - &Decoder::decode_OP_DS__DS_WRITE2_B32, - &Decoder::decode_OP_DS__DS_WRITE2ST64_B32, - &Decoder::decode_OP_DS__DS_CMPST_B32, - &Decoder::decode_OP_DS__DS_CMPST_F32, - &Decoder::decode_OP_DS__DS_MIN_F32, - &Decoder::decode_OP_DS__DS_MAX_F32, - &Decoder::decode_OP_DS__DS_NOP, - &Decoder::decode_OP_DS__DS_ADD_F32, 
- &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_WRITE_B8, - &Decoder::decode_OP_DS__DS_WRITE_B16, - &Decoder::decode_OP_DS__DS_ADD_RTN_U32, - &Decoder::decode_OP_DS__DS_SUB_RTN_U32, - &Decoder::decode_OP_DS__DS_RSUB_RTN_U32, - &Decoder::decode_OP_DS__DS_INC_RTN_U32, - &Decoder::decode_OP_DS__DS_DEC_RTN_U32, - &Decoder::decode_OP_DS__DS_MIN_RTN_I32, - &Decoder::decode_OP_DS__DS_MAX_RTN_I32, - &Decoder::decode_OP_DS__DS_MIN_RTN_U32, - &Decoder::decode_OP_DS__DS_MAX_RTN_U32, - &Decoder::decode_OP_DS__DS_AND_RTN_B32, - &Decoder::decode_OP_DS__DS_OR_RTN_B32, - &Decoder::decode_OP_DS__DS_XOR_RTN_B32, - &Decoder::decode_OP_DS__DS_MSKOR_RTN_B32, - &Decoder::decode_OP_DS__DS_WRXCHG_RTN_B32, - &Decoder::decode_OP_DS__DS_WRXCHG2_RTN_B32, - &Decoder::decode_OP_DS__DS_WRXCHG2ST64_RTN_B32, - &Decoder::decode_OP_DS__DS_CMPST_RTN_B32, - &Decoder::decode_OP_DS__DS_CMPST_RTN_F32, - &Decoder::decode_OP_DS__DS_MIN_RTN_F32, - &Decoder::decode_OP_DS__DS_MAX_RTN_F32, - &Decoder::decode_OP_DS__DS_WRAP_RTN_B32, - &Decoder::decode_OP_DS__DS_ADD_RTN_F32, - &Decoder::decode_OP_DS__DS_READ_B32, - &Decoder::decode_OP_DS__DS_READ2_B32, - &Decoder::decode_OP_DS__DS_READ2ST64_B32, - &Decoder::decode_OP_DS__DS_READ_I8, - &Decoder::decode_OP_DS__DS_READ_U8, - &Decoder::decode_OP_DS__DS_READ_I16, - &Decoder::decode_OP_DS__DS_READ_U16, - &Decoder::decode_OP_DS__DS_SWIZZLE_B32, - &Decoder::decode_OP_DS__DS_PERMUTE_B32, - &Decoder::decode_OP_DS__DS_BPERMUTE_B32, - &Decoder::decode_OP_DS__DS_ADD_U64, - &Decoder::decode_OP_DS__DS_SUB_U64, - &Decoder::decode_OP_DS__DS_RSUB_U64, - &Decoder::decode_OP_DS__DS_INC_U64, - &Decoder::decode_OP_DS__DS_DEC_U64, - &Decoder::decode_OP_DS__DS_MIN_I64, - &Decoder::decode_OP_DS__DS_MAX_I64, - &Decoder::decode_OP_DS__DS_MIN_U64, - &Decoder::decode_OP_DS__DS_MAX_U64, - 
&Decoder::decode_OP_DS__DS_AND_B64, - &Decoder::decode_OP_DS__DS_OR_B64, - &Decoder::decode_OP_DS__DS_XOR_B64, - &Decoder::decode_OP_DS__DS_MSKOR_B64, - &Decoder::decode_OP_DS__DS_WRITE_B64, - &Decoder::decode_OP_DS__DS_WRITE2_B64, - &Decoder::decode_OP_DS__DS_WRITE2ST64_B64, - &Decoder::decode_OP_DS__DS_CMPST_B64, - &Decoder::decode_OP_DS__DS_CMPST_F64, - &Decoder::decode_OP_DS__DS_MIN_F64, - &Decoder::decode_OP_DS__DS_MAX_F64, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_ADD_RTN_U64, - &Decoder::decode_OP_DS__DS_SUB_RTN_U64, - &Decoder::decode_OP_DS__DS_RSUB_RTN_U64, - &Decoder::decode_OP_DS__DS_INC_RTN_U64, - &Decoder::decode_OP_DS__DS_DEC_RTN_U64, - &Decoder::decode_OP_DS__DS_MIN_RTN_I64, - &Decoder::decode_OP_DS__DS_MAX_RTN_I64, - &Decoder::decode_OP_DS__DS_MIN_RTN_U64, - &Decoder::decode_OP_DS__DS_MAX_RTN_U64, - &Decoder::decode_OP_DS__DS_AND_RTN_B64, - &Decoder::decode_OP_DS__DS_OR_RTN_B64, - &Decoder::decode_OP_DS__DS_XOR_RTN_B64, - &Decoder::decode_OP_DS__DS_MSKOR_RTN_B64, - &Decoder::decode_OP_DS__DS_WRXCHG_RTN_B64, - &Decoder::decode_OP_DS__DS_WRXCHG2_RTN_B64, - &Decoder::decode_OP_DS__DS_WRXCHG2ST64_RTN_B64, - &Decoder::decode_OP_DS__DS_CMPST_RTN_B64, - &Decoder::decode_OP_DS__DS_CMPST_RTN_F64, - &Decoder::decode_OP_DS__DS_MIN_RTN_F64, - &Decoder::decode_OP_DS__DS_MAX_RTN_F64, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_READ_B64, - &Decoder::decode_OP_DS__DS_READ2_B64, - &Decoder::decode_OP_DS__DS_READ2ST64_B64, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_CONDXCHG32_RTN_B64, - 
&Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_ADD_SRC2_U32, - &Decoder::decode_OP_DS__DS_SUB_SRC2_U32, - &Decoder::decode_OP_DS__DS_RSUB_SRC2_U32, - &Decoder::decode_OP_DS__DS_INC_SRC2_U32, - &Decoder::decode_OP_DS__DS_DEC_SRC2_U32, - &Decoder::decode_OP_DS__DS_MIN_SRC2_I32, - &Decoder::decode_OP_DS__DS_MAX_SRC2_I32, - &Decoder::decode_OP_DS__DS_MIN_SRC2_U32, - &Decoder::decode_OP_DS__DS_MAX_SRC2_U32, - &Decoder::decode_OP_DS__DS_AND_SRC2_B32, - &Decoder::decode_OP_DS__DS_OR_SRC2_B32, - &Decoder::decode_OP_DS__DS_XOR_SRC2_B32, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_WRITE_SRC2_B32, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_MIN_SRC2_F32, - &Decoder::decode_OP_DS__DS_MAX_SRC2_F32, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_ADD_SRC2_F32, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_GWS_SEMA_RELEASE_ALL, - &Decoder::decode_OP_DS__DS_GWS_INIT, - &Decoder::decode_OP_DS__DS_GWS_SEMA_V, - &Decoder::decode_OP_DS__DS_GWS_SEMA_BR, - &Decoder::decode_OP_DS__DS_GWS_SEMA_P, - &Decoder::decode_OP_DS__DS_GWS_BARRIER, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_CONSUME, - &Decoder::decode_OP_DS__DS_APPEND, - &Decoder::decode_OP_DS__DS_ORDERED_COUNT, - &Decoder::decode_OP_DS__DS_ADD_SRC2_U64, - &Decoder::decode_OP_DS__DS_SUB_SRC2_U64, - &Decoder::decode_OP_DS__DS_RSUB_SRC2_U64, - &Decoder::decode_OP_DS__DS_INC_SRC2_U64, - &Decoder::decode_OP_DS__DS_DEC_SRC2_U64, - &Decoder::decode_OP_DS__DS_MIN_SRC2_I64, - &Decoder::decode_OP_DS__DS_MAX_SRC2_I64, - &Decoder::decode_OP_DS__DS_MIN_SRC2_U64, - &Decoder::decode_OP_DS__DS_MAX_SRC2_U64, - &Decoder::decode_OP_DS__DS_AND_SRC2_B64, - &Decoder::decode_OP_DS__DS_OR_SRC2_B64, - &Decoder::decode_OP_DS__DS_XOR_SRC2_B64, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_WRITE_SRC2_B64, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_MIN_SRC2_F64, - &Decoder::decode_OP_DS__DS_MAX_SRC2_F64, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_WRITE_B96, - &Decoder::decode_OP_DS__DS_WRITE_B128, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_DS__DS_READ_B96, - &Decoder::decode_OP_DS__DS_READ_B128 - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_FLAT[] = { - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_FLAT__FLAT_LOAD_UBYTE, - &Decoder::decode_OP_FLAT__FLAT_LOAD_SBYTE, - &Decoder::decode_OP_FLAT__FLAT_LOAD_USHORT, - &Decoder::decode_OP_FLAT__FLAT_LOAD_SSHORT, - &Decoder::decode_OP_FLAT__FLAT_LOAD_DWORD, - &Decoder::decode_OP_FLAT__FLAT_LOAD_DWORDX2, - &Decoder::decode_OP_FLAT__FLAT_LOAD_DWORDX3, - &Decoder::decode_OP_FLAT__FLAT_LOAD_DWORDX4, - &Decoder::decode_OP_FLAT__FLAT_STORE_BYTE, - &Decoder::decode_invalid, - &Decoder::decode_OP_FLAT__FLAT_STORE_SHORT, - &Decoder::decode_invalid, - &Decoder::decode_OP_FLAT__FLAT_STORE_DWORD, - &Decoder::decode_OP_FLAT__FLAT_STORE_DWORDX2, - &Decoder::decode_OP_FLAT__FLAT_STORE_DWORDX3, - &Decoder::decode_OP_FLAT__FLAT_STORE_DWORDX4, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_SWAP, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_ADD, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_SUB, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_SMIN, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_UMIN, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_SMAX, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_UMAX, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_AND, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_OR, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_XOR, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_INC, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_DEC, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_ADD_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_SUB_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_SMIN_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_UMIN_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_SMAX_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_UMAX_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_AND_X2, - 
&Decoder::decode_OP_FLAT__FLAT_ATOMIC_OR_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_XOR_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_INC_X2, - &Decoder::decode_OP_FLAT__FLAT_ATOMIC_DEC_X2, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_MIMG[] = { - &Decoder::decode_OP_MIMG__IMAGE_LOAD, - &Decoder::decode_OP_MIMG__IMAGE_LOAD_MIP, - &Decoder::decode_OP_MIMG__IMAGE_LOAD_PCK, - &Decoder::decode_OP_MIMG__IMAGE_LOAD_PCK_SGN, - &Decoder::decode_OP_MIMG__IMAGE_LOAD_MIP_PCK, - &Decoder::decode_OP_MIMG__IMAGE_LOAD_MIP_PCK_SGN, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MIMG__IMAGE_STORE, - &Decoder::decode_OP_MIMG__IMAGE_STORE_MIP, - &Decoder::decode_OP_MIMG__IMAGE_STORE_PCK, - &Decoder::decode_OP_MIMG__IMAGE_STORE_MIP_PCK, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MIMG__IMAGE_GET_RESINFO, - &Decoder::decode_invalid, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_SWAP, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_CMPSWAP, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_ADD, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_SUB, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_SMIN, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_UMIN, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_SMAX, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_UMAX, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_AND, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_OR, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_XOR, - &Decoder::decode_OP_MIMG__IMAGE_ATOMIC_INC, - 
&Decoder::decode_OP_MIMG__IMAGE_ATOMIC_DEC, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CL, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_D, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_D_CL, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_L, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_B, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_B_CL, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_LZ, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CL, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_D, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_D_CL, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_L, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_B, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_B_CL, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_LZ, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CL_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_D_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_D_CL_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_L_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_B_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_B_CL_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_LZ_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CL_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_D_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_D_CL_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_L_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_B_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_B_CL_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_LZ_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_CL, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_L, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_B, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_B_CL, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_LZ, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C, - 
&Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_CL, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_L, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_B, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_B_CL, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_LZ, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_CL_O, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_L_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_B_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_B_CL_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_LZ_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_CL_O, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_L_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_B_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_B_CL_O, - &Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_LZ_O, - &Decoder::decode_OP_MIMG__IMAGE_GET_LOD, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CD, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CD_CL, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CD, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CD_CL, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CD_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CD_CL_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CD_O, - &Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CD_CL_O, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_MTBUF[] = { - &Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_X, - &Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XY, - &Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XYZ, - &Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XYZW, - &Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_X, - &Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XY, - &Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XYZ, - &Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XYZW, - &Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_X, - &Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY, - &Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ, - &Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW, - &Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_X, - &Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XY, - &Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ, - &Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_MUBUF[] = { - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_X, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XY, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XYZ, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XYZW, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_X, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_XY, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_XYZ, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_XYZW, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_X, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XY, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_X, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XY, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW, - 
&Decoder::decode_OP_MUBUF__BUFFER_LOAD_UBYTE, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_SBYTE, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_USHORT, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_SSHORT, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_DWORD, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_DWORDX2, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_DWORDX3, - &Decoder::decode_OP_MUBUF__BUFFER_LOAD_DWORDX4, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_BYTE, - &Decoder::decode_invalid, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_SHORT, - &Decoder::decode_invalid, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_DWORD, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_DWORDX2, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_DWORDX3, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_DWORDX4, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MUBUF__BUFFER_STORE_LDS_DWORD, - &Decoder::decode_OP_MUBUF__BUFFER_WBINVL1, - &Decoder::decode_OP_MUBUF__BUFFER_WBINVL1_VOL, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SWAP, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_CMPSWAP, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_ADD, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SUB, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SMIN, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_UMIN, 
- &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SMAX, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_UMAX, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_AND, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_OR, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_XOR, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_INC, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_DEC, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SWAP_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_ADD_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SUB_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SMIN_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_UMIN_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SMAX_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_UMAX_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_AND_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_OR_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_XOR_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_INC_X2, - &Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_DEC_X2, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_SMEM[] = { - &Decoder::decode_OP_SMEM__S_LOAD_DWORD, - &Decoder::decode_OP_SMEM__S_LOAD_DWORDX2, - &Decoder::decode_OP_SMEM__S_LOAD_DWORDX4, - &Decoder::decode_OP_SMEM__S_LOAD_DWORDX8, - &Decoder::decode_OP_SMEM__S_LOAD_DWORDX16, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORD, - &Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORDX2, - &Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORDX4, - &Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORDX8, - &Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORDX16, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_SMEM__S_STORE_DWORD, - &Decoder::decode_OP_SMEM__S_STORE_DWORDX2, - &Decoder::decode_OP_SMEM__S_STORE_DWORDX4, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_SMEM__S_BUFFER_STORE_DWORD, - &Decoder::decode_OP_SMEM__S_BUFFER_STORE_DWORDX2, - &Decoder::decode_OP_SMEM__S_BUFFER_STORE_DWORDX4, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_SMEM__S_DCACHE_INV, - &Decoder::decode_OP_SMEM__S_DCACHE_WB, - &Decoder::decode_OP_SMEM__S_DCACHE_INV_VOL, - &Decoder::decode_OP_SMEM__S_DCACHE_WB_VOL, - &Decoder::decode_OP_SMEM__S_MEMTIME, - &Decoder::decode_OP_SMEM__S_MEMREALTIME, - &Decoder::decode_OP_SMEM__S_ATC_PROBE, - &Decoder::decode_OP_SMEM__S_ATC_PROBE_BUFFER, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_SOP1[] = { - &Decoder::decode_OP_SOP1__S_MOV_B32, - &Decoder::decode_OP_SOP1__S_MOV_B64, - &Decoder::decode_OP_SOP1__S_CMOV_B32, - &Decoder::decode_OP_SOP1__S_CMOV_B64, - &Decoder::decode_OP_SOP1__S_NOT_B32, - &Decoder::decode_OP_SOP1__S_NOT_B64, - &Decoder::decode_OP_SOP1__S_WQM_B32, - &Decoder::decode_OP_SOP1__S_WQM_B64, - &Decoder::decode_OP_SOP1__S_BREV_B32, - &Decoder::decode_OP_SOP1__S_BREV_B64, - &Decoder::decode_OP_SOP1__S_BCNT0_I32_B32, - &Decoder::decode_OP_SOP1__S_BCNT0_I32_B64, - &Decoder::decode_OP_SOP1__S_BCNT1_I32_B32, - &Decoder::decode_OP_SOP1__S_BCNT1_I32_B64, - &Decoder::decode_OP_SOP1__S_FF0_I32_B32, - &Decoder::decode_OP_SOP1__S_FF0_I32_B64, - &Decoder::decode_OP_SOP1__S_FF1_I32_B32, - &Decoder::decode_OP_SOP1__S_FF1_I32_B64, - &Decoder::decode_OP_SOP1__S_FLBIT_I32_B32, - &Decoder::decode_OP_SOP1__S_FLBIT_I32_B64, - &Decoder::decode_OP_SOP1__S_FLBIT_I32, - &Decoder::decode_OP_SOP1__S_FLBIT_I32_I64, - &Decoder::decode_OP_SOP1__S_SEXT_I32_I8, - &Decoder::decode_OP_SOP1__S_SEXT_I32_I16, - &Decoder::decode_OP_SOP1__S_BITSET0_B32, - &Decoder::decode_OP_SOP1__S_BITSET0_B64, - &Decoder::decode_OP_SOP1__S_BITSET1_B32, - &Decoder::decode_OP_SOP1__S_BITSET1_B64, - &Decoder::decode_OP_SOP1__S_GETPC_B64, - &Decoder::decode_OP_SOP1__S_SETPC_B64, - &Decoder::decode_OP_SOP1__S_SWAPPC_B64, - &Decoder::decode_OP_SOP1__S_RFE_B64, - &Decoder::decode_OP_SOP1__S_AND_SAVEEXEC_B64, - &Decoder::decode_OP_SOP1__S_OR_SAVEEXEC_B64, - &Decoder::decode_OP_SOP1__S_XOR_SAVEEXEC_B64, - &Decoder::decode_OP_SOP1__S_ANDN2_SAVEEXEC_B64, - 
&Decoder::decode_OP_SOP1__S_ORN2_SAVEEXEC_B64, - &Decoder::decode_OP_SOP1__S_NAND_SAVEEXEC_B64, - &Decoder::decode_OP_SOP1__S_NOR_SAVEEXEC_B64, - &Decoder::decode_OP_SOP1__S_XNOR_SAVEEXEC_B64, - &Decoder::decode_OP_SOP1__S_QUADMASK_B32, - &Decoder::decode_OP_SOP1__S_QUADMASK_B64, - &Decoder::decode_OP_SOP1__S_MOVRELS_B32, - &Decoder::decode_OP_SOP1__S_MOVRELS_B64, - &Decoder::decode_OP_SOP1__S_MOVRELD_B32, - &Decoder::decode_OP_SOP1__S_MOVRELD_B64, - &Decoder::decode_OP_SOP1__S_CBRANCH_JOIN, - &Decoder::decode_invalid, - &Decoder::decode_OP_SOP1__S_ABS_I32, - &Decoder::decode_OP_SOP1__S_MOV_FED_B32, - &Decoder::decode_OP_SOP1__S_SET_GPR_IDX_IDX, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_SOPC[] = { - &Decoder::decode_OP_SOPC__S_CMP_EQ_I32, - &Decoder::decode_OP_SOPC__S_CMP_LG_I32, - &Decoder::decode_OP_SOPC__S_CMP_GT_I32, - &Decoder::decode_OP_SOPC__S_CMP_GE_I32, - &Decoder::decode_OP_SOPC__S_CMP_LT_I32, - &Decoder::decode_OP_SOPC__S_CMP_LE_I32, - &Decoder::decode_OP_SOPC__S_CMP_EQ_U32, - &Decoder::decode_OP_SOPC__S_CMP_LG_U32, - &Decoder::decode_OP_SOPC__S_CMP_GT_U32, - &Decoder::decode_OP_SOPC__S_CMP_GE_U32, - &Decoder::decode_OP_SOPC__S_CMP_LT_U32, - &Decoder::decode_OP_SOPC__S_CMP_LE_U32, - &Decoder::decode_OP_SOPC__S_BITCMP0_B32, - &Decoder::decode_OP_SOPC__S_BITCMP1_B32, - &Decoder::decode_OP_SOPC__S_BITCMP0_B64, - &Decoder::decode_OP_SOPC__S_BITCMP1_B64, - &Decoder::decode_OP_SOPC__S_SETVSKIP, - &Decoder::decode_OP_SOPC__S_SET_GPR_IDX_ON, - &Decoder::decode_OP_SOPC__S_CMP_EQ_U64, - &Decoder::decode_OP_SOPC__S_CMP_LG_U64, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_SOPP[] = { - &Decoder::decode_OP_SOPP__S_NOP, - &Decoder::decode_OP_SOPP__S_ENDPGM, - &Decoder::decode_OP_SOPP__S_BRANCH, - &Decoder::decode_OP_SOPP__S_WAKEUP, - &Decoder::decode_OP_SOPP__S_CBRANCH_SCC0, - &Decoder::decode_OP_SOPP__S_CBRANCH_SCC1, - &Decoder::decode_OP_SOPP__S_CBRANCH_VCCZ, - &Decoder::decode_OP_SOPP__S_CBRANCH_VCCNZ, - &Decoder::decode_OP_SOPP__S_CBRANCH_EXECZ, - &Decoder::decode_OP_SOPP__S_CBRANCH_EXECNZ, - &Decoder::decode_OP_SOPP__S_BARRIER, - &Decoder::decode_OP_SOPP__S_SETKILL, - &Decoder::decode_OP_SOPP__S_WAITCNT, - &Decoder::decode_OP_SOPP__S_SETHALT, - &Decoder::decode_OP_SOPP__S_SLEEP, - &Decoder::decode_OP_SOPP__S_SETPRIO, - &Decoder::decode_OP_SOPP__S_SENDMSG, - &Decoder::decode_OP_SOPP__S_SENDMSGHALT, - &Decoder::decode_OP_SOPP__S_TRAP, - &Decoder::decode_OP_SOPP__S_ICACHE_INV, - &Decoder::decode_OP_SOPP__S_INCPERFLEVEL, - &Decoder::decode_OP_SOPP__S_DECPERFLEVEL, - &Decoder::decode_OP_SOPP__S_TTRACEDATA, - &Decoder::decode_OP_SOPP__S_CBRANCH_CDBGSYS, - &Decoder::decode_OP_SOPP__S_CBRANCH_CDBGUSER, - &Decoder::decode_OP_SOPP__S_CBRANCH_CDBGSYS_OR_USER, - &Decoder::decode_OP_SOPP__S_CBRANCH_CDBGSYS_AND_USER, - &Decoder::decode_OP_SOPP__S_ENDPGM_SAVED, - &Decoder::decode_OP_SOPP__S_SET_GPR_IDX_OFF, - &Decoder::decode_OP_SOPP__S_SET_GPR_IDX_MODE, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_VINTRP[] = { - &Decoder::decode_OP_VINTRP__V_INTERP_P1_F32, - &Decoder::decode_OP_VINTRP__V_INTERP_P2_F32, - &Decoder::decode_OP_VINTRP__V_INTERP_MOV_F32, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_VOP1[] = { - &Decoder::decode_OP_VOP1__V_NOP, - &Decoder::decode_OP_VOP1__V_MOV_B32, - &Decoder::decode_OP_VOP1__V_READFIRSTLANE_B32, - &Decoder::decode_OP_VOP1__V_CVT_I32_F64, - &Decoder::decode_OP_VOP1__V_CVT_F64_I32, - &Decoder::decode_OP_VOP1__V_CVT_F32_I32, - &Decoder::decode_OP_VOP1__V_CVT_F32_U32, - &Decoder::decode_OP_VOP1__V_CVT_U32_F32, - &Decoder::decode_OP_VOP1__V_CVT_I32_F32, - &Decoder::decode_OP_VOP1__V_MOV_FED_B32, - &Decoder::decode_OP_VOP1__V_CVT_F16_F32, - &Decoder::decode_OP_VOP1__V_CVT_F32_F16, - &Decoder::decode_OP_VOP1__V_CVT_RPI_I32_F32, - &Decoder::decode_OP_VOP1__V_CVT_FLR_I32_F32, - &Decoder::decode_OP_VOP1__V_CVT_OFF_F32_I4, - &Decoder::decode_OP_VOP1__V_CVT_F32_F64, - &Decoder::decode_OP_VOP1__V_CVT_F64_F32, - &Decoder::decode_OP_VOP1__V_CVT_F32_UBYTE0, - &Decoder::decode_OP_VOP1__V_CVT_F32_UBYTE1, - &Decoder::decode_OP_VOP1__V_CVT_F32_UBYTE2, - &Decoder::decode_OP_VOP1__V_CVT_F32_UBYTE3, - &Decoder::decode_OP_VOP1__V_CVT_U32_F64, - &Decoder::decode_OP_VOP1__V_CVT_F64_U32, - &Decoder::decode_OP_VOP1__V_TRUNC_F64, - &Decoder::decode_OP_VOP1__V_CEIL_F64, - &Decoder::decode_OP_VOP1__V_RNDNE_F64, - &Decoder::decode_OP_VOP1__V_FLOOR_F64, - &Decoder::decode_OP_VOP1__V_FRACT_F32, - 
&Decoder::decode_OP_VOP1__V_TRUNC_F32, - &Decoder::decode_OP_VOP1__V_CEIL_F32, - &Decoder::decode_OP_VOP1__V_RNDNE_F32, - &Decoder::decode_OP_VOP1__V_FLOOR_F32, - &Decoder::decode_OP_VOP1__V_EXP_F32, - &Decoder::decode_OP_VOP1__V_LOG_F32, - &Decoder::decode_OP_VOP1__V_RCP_F32, - &Decoder::decode_OP_VOP1__V_RCP_IFLAG_F32, - &Decoder::decode_OP_VOP1__V_RSQ_F32, - &Decoder::decode_OP_VOP1__V_RCP_F64, - &Decoder::decode_OP_VOP1__V_RSQ_F64, - &Decoder::decode_OP_VOP1__V_SQRT_F32, - &Decoder::decode_OP_VOP1__V_SQRT_F64, - &Decoder::decode_OP_VOP1__V_SIN_F32, - &Decoder::decode_OP_VOP1__V_COS_F32, - &Decoder::decode_OP_VOP1__V_NOT_B32, - &Decoder::decode_OP_VOP1__V_BFREV_B32, - &Decoder::decode_OP_VOP1__V_FFBH_U32, - &Decoder::decode_OP_VOP1__V_FFBL_B32, - &Decoder::decode_OP_VOP1__V_FFBH_I32, - &Decoder::decode_OP_VOP1__V_FREXP_EXP_I32_F64, - &Decoder::decode_OP_VOP1__V_FREXP_MANT_F64, - &Decoder::decode_OP_VOP1__V_FRACT_F64, - &Decoder::decode_OP_VOP1__V_FREXP_EXP_I32_F32, - &Decoder::decode_OP_VOP1__V_FREXP_MANT_F32, - &Decoder::decode_OP_VOP1__V_CLREXCP, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_VOP1__V_CVT_F16_U16, - &Decoder::decode_OP_VOP1__V_CVT_F16_I16, - &Decoder::decode_OP_VOP1__V_CVT_U16_F16, - &Decoder::decode_OP_VOP1__V_CVT_I16_F16, - &Decoder::decode_OP_VOP1__V_RCP_F16, - &Decoder::decode_OP_VOP1__V_SQRT_F16, - &Decoder::decode_OP_VOP1__V_RSQ_F16, - &Decoder::decode_OP_VOP1__V_LOG_F16, - &Decoder::decode_OP_VOP1__V_EXP_F16, - &Decoder::decode_OP_VOP1__V_FREXP_MANT_F16, - &Decoder::decode_OP_VOP1__V_FREXP_EXP_I16_F16, - &Decoder::decode_OP_VOP1__V_FLOOR_F16, - &Decoder::decode_OP_VOP1__V_CEIL_F16, - &Decoder::decode_OP_VOP1__V_TRUNC_F16, - &Decoder::decode_OP_VOP1__V_RNDNE_F16, - &Decoder::decode_OP_VOP1__V_FRACT_F16, - &Decoder::decode_OP_VOP1__V_SIN_F16, - &Decoder::decode_OP_VOP1__V_COS_F16, - &Decoder::decode_OP_VOP1__V_EXP_LEGACY_F32, - &Decoder::decode_OP_VOP1__V_LOG_LEGACY_F32, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid - }; - - IsaDecodeMethod Decoder::tableSubDecode_OP_VOPC[] = { - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_VOPC__V_CMP_CLASS_F32, - &Decoder::decode_OP_VOPC__V_CMPX_CLASS_F32, - &Decoder::decode_OP_VOPC__V_CMP_CLASS_F64, - &Decoder::decode_OP_VOPC__V_CMPX_CLASS_F64, - &Decoder::decode_OP_VOPC__V_CMP_CLASS_F16, - &Decoder::decode_OP_VOPC__V_CMPX_CLASS_F16, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - 
&Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_VOPC__V_CMP_F_F16, - &Decoder::decode_OP_VOPC__V_CMP_LT_F16, - &Decoder::decode_OP_VOPC__V_CMP_EQ_F16, - &Decoder::decode_OP_VOPC__V_CMP_LE_F16, - &Decoder::decode_OP_VOPC__V_CMP_GT_F16, - &Decoder::decode_OP_VOPC__V_CMP_LG_F16, - &Decoder::decode_OP_VOPC__V_CMP_GE_F16, - &Decoder::decode_OP_VOPC__V_CMP_O_F16, - &Decoder::decode_OP_VOPC__V_CMP_U_F16, - &Decoder::decode_OP_VOPC__V_CMP_NGE_F16, - &Decoder::decode_OP_VOPC__V_CMP_NLG_F16, - &Decoder::decode_OP_VOPC__V_CMP_NGT_F16, - &Decoder::decode_OP_VOPC__V_CMP_NLE_F16, - &Decoder::decode_OP_VOPC__V_CMP_NEQ_F16, - &Decoder::decode_OP_VOPC__V_CMP_NLT_F16, - &Decoder::decode_OP_VOPC__V_CMP_TRU_F16, - &Decoder::decode_OP_VOPC__V_CMPX_F_F16, - &Decoder::decode_OP_VOPC__V_CMPX_LT_F16, - &Decoder::decode_OP_VOPC__V_CMPX_EQ_F16, - &Decoder::decode_OP_VOPC__V_CMPX_LE_F16, - &Decoder::decode_OP_VOPC__V_CMPX_GT_F16, - &Decoder::decode_OP_VOPC__V_CMPX_LG_F16, - &Decoder::decode_OP_VOPC__V_CMPX_GE_F16, - &Decoder::decode_OP_VOPC__V_CMPX_O_F16, - &Decoder::decode_OP_VOPC__V_CMPX_U_F16, - &Decoder::decode_OP_VOPC__V_CMPX_NGE_F16, - &Decoder::decode_OP_VOPC__V_CMPX_NLG_F16, - &Decoder::decode_OP_VOPC__V_CMPX_NGT_F16, - &Decoder::decode_OP_VOPC__V_CMPX_NLE_F16, - &Decoder::decode_OP_VOPC__V_CMPX_NEQ_F16, - &Decoder::decode_OP_VOPC__V_CMPX_NLT_F16, - &Decoder::decode_OP_VOPC__V_CMPX_TRU_F16, - &Decoder::decode_OP_VOPC__V_CMP_F_F32, - &Decoder::decode_OP_VOPC__V_CMP_LT_F32, - &Decoder::decode_OP_VOPC__V_CMP_EQ_F32, - &Decoder::decode_OP_VOPC__V_CMP_LE_F32, - &Decoder::decode_OP_VOPC__V_CMP_GT_F32, - &Decoder::decode_OP_VOPC__V_CMP_LG_F32, - &Decoder::decode_OP_VOPC__V_CMP_GE_F32, - &Decoder::decode_OP_VOPC__V_CMP_O_F32, - &Decoder::decode_OP_VOPC__V_CMP_U_F32, - &Decoder::decode_OP_VOPC__V_CMP_NGE_F32, - &Decoder::decode_OP_VOPC__V_CMP_NLG_F32, - &Decoder::decode_OP_VOPC__V_CMP_NGT_F32, - 
&Decoder::decode_OP_VOPC__V_CMP_NLE_F32, - &Decoder::decode_OP_VOPC__V_CMP_NEQ_F32, - &Decoder::decode_OP_VOPC__V_CMP_NLT_F32, - &Decoder::decode_OP_VOPC__V_CMP_TRU_F32, - &Decoder::decode_OP_VOPC__V_CMPX_F_F32, - &Decoder::decode_OP_VOPC__V_CMPX_LT_F32, - &Decoder::decode_OP_VOPC__V_CMPX_EQ_F32, - &Decoder::decode_OP_VOPC__V_CMPX_LE_F32, - &Decoder::decode_OP_VOPC__V_CMPX_GT_F32, - &Decoder::decode_OP_VOPC__V_CMPX_LG_F32, - &Decoder::decode_OP_VOPC__V_CMPX_GE_F32, - &Decoder::decode_OP_VOPC__V_CMPX_O_F32, - &Decoder::decode_OP_VOPC__V_CMPX_U_F32, - &Decoder::decode_OP_VOPC__V_CMPX_NGE_F32, - &Decoder::decode_OP_VOPC__V_CMPX_NLG_F32, - &Decoder::decode_OP_VOPC__V_CMPX_NGT_F32, - &Decoder::decode_OP_VOPC__V_CMPX_NLE_F32, - &Decoder::decode_OP_VOPC__V_CMPX_NEQ_F32, - &Decoder::decode_OP_VOPC__V_CMPX_NLT_F32, - &Decoder::decode_OP_VOPC__V_CMPX_TRU_F32, - &Decoder::decode_OP_VOPC__V_CMP_F_F64, - &Decoder::decode_OP_VOPC__V_CMP_LT_F64, - &Decoder::decode_OP_VOPC__V_CMP_EQ_F64, - &Decoder::decode_OP_VOPC__V_CMP_LE_F64, - &Decoder::decode_OP_VOPC__V_CMP_GT_F64, - &Decoder::decode_OP_VOPC__V_CMP_LG_F64, - &Decoder::decode_OP_VOPC__V_CMP_GE_F64, - &Decoder::decode_OP_VOPC__V_CMP_O_F64, - &Decoder::decode_OP_VOPC__V_CMP_U_F64, - &Decoder::decode_OP_VOPC__V_CMP_NGE_F64, - &Decoder::decode_OP_VOPC__V_CMP_NLG_F64, - &Decoder::decode_OP_VOPC__V_CMP_NGT_F64, - &Decoder::decode_OP_VOPC__V_CMP_NLE_F64, - &Decoder::decode_OP_VOPC__V_CMP_NEQ_F64, - &Decoder::decode_OP_VOPC__V_CMP_NLT_F64, - &Decoder::decode_OP_VOPC__V_CMP_TRU_F64, - &Decoder::decode_OP_VOPC__V_CMPX_F_F64, - &Decoder::decode_OP_VOPC__V_CMPX_LT_F64, - &Decoder::decode_OP_VOPC__V_CMPX_EQ_F64, - &Decoder::decode_OP_VOPC__V_CMPX_LE_F64, - &Decoder::decode_OP_VOPC__V_CMPX_GT_F64, - &Decoder::decode_OP_VOPC__V_CMPX_LG_F64, - &Decoder::decode_OP_VOPC__V_CMPX_GE_F64, - &Decoder::decode_OP_VOPC__V_CMPX_O_F64, - &Decoder::decode_OP_VOPC__V_CMPX_U_F64, - &Decoder::decode_OP_VOPC__V_CMPX_NGE_F64, - 
&Decoder::decode_OP_VOPC__V_CMPX_NLG_F64, - &Decoder::decode_OP_VOPC__V_CMPX_NGT_F64, - &Decoder::decode_OP_VOPC__V_CMPX_NLE_F64, - &Decoder::decode_OP_VOPC__V_CMPX_NEQ_F64, - &Decoder::decode_OP_VOPC__V_CMPX_NLT_F64, - &Decoder::decode_OP_VOPC__V_CMPX_TRU_F64, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_OP_VOPC__V_CMP_F_I16, - &Decoder::decode_OP_VOPC__V_CMP_LT_I16, - &Decoder::decode_OP_VOPC__V_CMP_EQ_I16, - &Decoder::decode_OP_VOPC__V_CMP_LE_I16, - &Decoder::decode_OP_VOPC__V_CMP_GT_I16, - &Decoder::decode_OP_VOPC__V_CMP_NE_I16, - &Decoder::decode_OP_VOPC__V_CMP_GE_I16, - &Decoder::decode_OP_VOPC__V_CMP_T_I16, - &Decoder::decode_OP_VOPC__V_CMP_F_U16, - &Decoder::decode_OP_VOPC__V_CMP_LT_U16, - &Decoder::decode_OP_VOPC__V_CMP_EQ_U16, - &Decoder::decode_OP_VOPC__V_CMP_LE_U16, - &Decoder::decode_OP_VOPC__V_CMP_GT_U16, - &Decoder::decode_OP_VOPC__V_CMP_NE_U16, - &Decoder::decode_OP_VOPC__V_CMP_GE_U16, - &Decoder::decode_OP_VOPC__V_CMP_T_U16, - &Decoder::decode_OP_VOPC__V_CMPX_F_I16, - &Decoder::decode_OP_VOPC__V_CMPX_LT_I16, - &Decoder::decode_OP_VOPC__V_CMPX_EQ_I16, - &Decoder::decode_OP_VOPC__V_CMPX_LE_I16, - 
&Decoder::decode_OP_VOPC__V_CMPX_GT_I16, - &Decoder::decode_OP_VOPC__V_CMPX_NE_I16, - &Decoder::decode_OP_VOPC__V_CMPX_GE_I16, - &Decoder::decode_OP_VOPC__V_CMPX_T_I16, - &Decoder::decode_OP_VOPC__V_CMPX_F_U16, - &Decoder::decode_OP_VOPC__V_CMPX_LT_U16, - &Decoder::decode_OP_VOPC__V_CMPX_EQ_U16, - &Decoder::decode_OP_VOPC__V_CMPX_LE_U16, - &Decoder::decode_OP_VOPC__V_CMPX_GT_U16, - &Decoder::decode_OP_VOPC__V_CMPX_NE_U16, - &Decoder::decode_OP_VOPC__V_CMPX_GE_U16, - &Decoder::decode_OP_VOPC__V_CMPX_T_U16, - &Decoder::decode_OP_VOPC__V_CMP_F_I32, - &Decoder::decode_OP_VOPC__V_CMP_LT_I32, - &Decoder::decode_OP_VOPC__V_CMP_EQ_I32, - &Decoder::decode_OP_VOPC__V_CMP_LE_I32, - &Decoder::decode_OP_VOPC__V_CMP_GT_I32, - &Decoder::decode_OP_VOPC__V_CMP_NE_I32, - &Decoder::decode_OP_VOPC__V_CMP_GE_I32, - &Decoder::decode_OP_VOPC__V_CMP_T_I32, - &Decoder::decode_OP_VOPC__V_CMP_F_U32, - &Decoder::decode_OP_VOPC__V_CMP_LT_U32, - &Decoder::decode_OP_VOPC__V_CMP_EQ_U32, - &Decoder::decode_OP_VOPC__V_CMP_LE_U32, - &Decoder::decode_OP_VOPC__V_CMP_GT_U32, - &Decoder::decode_OP_VOPC__V_CMP_NE_U32, - &Decoder::decode_OP_VOPC__V_CMP_GE_U32, - &Decoder::decode_OP_VOPC__V_CMP_T_U32, - &Decoder::decode_OP_VOPC__V_CMPX_F_I32, - &Decoder::decode_OP_VOPC__V_CMPX_LT_I32, - &Decoder::decode_OP_VOPC__V_CMPX_EQ_I32, - &Decoder::decode_OP_VOPC__V_CMPX_LE_I32, - &Decoder::decode_OP_VOPC__V_CMPX_GT_I32, - &Decoder::decode_OP_VOPC__V_CMPX_NE_I32, - &Decoder::decode_OP_VOPC__V_CMPX_GE_I32, - &Decoder::decode_OP_VOPC__V_CMPX_T_I32, - &Decoder::decode_OP_VOPC__V_CMPX_F_U32, - &Decoder::decode_OP_VOPC__V_CMPX_LT_U32, - &Decoder::decode_OP_VOPC__V_CMPX_EQ_U32, - &Decoder::decode_OP_VOPC__V_CMPX_LE_U32, - &Decoder::decode_OP_VOPC__V_CMPX_GT_U32, - &Decoder::decode_OP_VOPC__V_CMPX_NE_U32, - &Decoder::decode_OP_VOPC__V_CMPX_GE_U32, - &Decoder::decode_OP_VOPC__V_CMPX_T_U32, - &Decoder::decode_OP_VOPC__V_CMP_F_I64, - &Decoder::decode_OP_VOPC__V_CMP_LT_I64, - &Decoder::decode_OP_VOPC__V_CMP_EQ_I64, - 
&Decoder::decode_OP_VOPC__V_CMP_LE_I64, - &Decoder::decode_OP_VOPC__V_CMP_GT_I64, - &Decoder::decode_OP_VOPC__V_CMP_NE_I64, - &Decoder::decode_OP_VOPC__V_CMP_GE_I64, - &Decoder::decode_OP_VOPC__V_CMP_T_I64, - &Decoder::decode_OP_VOPC__V_CMP_F_U64, - &Decoder::decode_OP_VOPC__V_CMP_LT_U64, - &Decoder::decode_OP_VOPC__V_CMP_EQ_U64, - &Decoder::decode_OP_VOPC__V_CMP_LE_U64, - &Decoder::decode_OP_VOPC__V_CMP_GT_U64, - &Decoder::decode_OP_VOPC__V_CMP_NE_U64, - &Decoder::decode_OP_VOPC__V_CMP_GE_U64, - &Decoder::decode_OP_VOPC__V_CMP_T_U64, - &Decoder::decode_OP_VOPC__V_CMPX_F_I64, - &Decoder::decode_OP_VOPC__V_CMPX_LT_I64, - &Decoder::decode_OP_VOPC__V_CMPX_EQ_I64, - &Decoder::decode_OP_VOPC__V_CMPX_LE_I64, - &Decoder::decode_OP_VOPC__V_CMPX_GT_I64, - &Decoder::decode_OP_VOPC__V_CMPX_NE_I64, - &Decoder::decode_OP_VOPC__V_CMPX_GE_I64, - &Decoder::decode_OP_VOPC__V_CMPX_T_I64, - &Decoder::decode_OP_VOPC__V_CMPX_F_U64, - &Decoder::decode_OP_VOPC__V_CMPX_LT_U64, - &Decoder::decode_OP_VOPC__V_CMPX_EQ_U64, - &Decoder::decode_OP_VOPC__V_CMPX_LE_U64, - &Decoder::decode_OP_VOPC__V_CMPX_GT_U64, - &Decoder::decode_OP_VOPC__V_CMPX_NE_U64, - &Decoder::decode_OP_VOPC__V_CMPX_GE_U64, - &Decoder::decode_OP_VOPC__V_CMPX_T_U64, - }; - - GPUStaticInst* - Decoder::decode(MachInst mach_inst) - { - InFmt_SOP1 *enc = &mach_inst->iFmt_SOP1; - IsaDecodeMethod method = tableDecodePrimary[enc->ENCODING]; - return (this->*method)(mach_inst); - } // decode - - GPUStaticInst* - Decoder::subDecode_OP_VOPC(MachInst iFmt) - { - InFmt_VOPC *enc = &iFmt->iFmt_VOPC; - IsaDecodeMethod method = tableSubDecode_OP_VOPC[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_VOPC - - GPUStaticInst* - Decoder::subDecode_OP_VOP1(MachInst iFmt) - { - InFmt_VOP1 *enc = &iFmt->iFmt_VOP1; - IsaDecodeMethod method = tableSubDecode_OP_VOP1[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_VOP1 - - GPUStaticInst* - Decoder::subDecode_OP_SOP1(MachInst iFmt) - { - InFmt_SOP1 *enc = &iFmt->iFmt_SOP1; - 
IsaDecodeMethod method = tableSubDecode_OP_SOP1[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_SOP1 - - GPUStaticInst* - Decoder::subDecode_OP_SOPC(MachInst iFmt) - { - InFmt_SOPC *enc = &iFmt->iFmt_SOPC; - IsaDecodeMethod method = tableSubDecode_OP_SOPC[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_SOPC - - GPUStaticInst* - Decoder::subDecode_OP_SOPP(MachInst iFmt) - { - InFmt_SOPP *enc = &iFmt->iFmt_SOPP; - IsaDecodeMethod method = tableSubDecode_OP_SOPP[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_SOPP - - GPUStaticInst* - Decoder::subDecode_OP_SMEM(MachInst iFmt) - { - InFmt_SMEM *enc = &iFmt->iFmt_SMEM; - IsaDecodeMethod method = tableSubDecode_OP_SMEM[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_SMEM - - GPUStaticInst* - Decoder::subDecode_OPU_VOP3(MachInst iFmt) - { - InFmt_VOP3 *enc = &iFmt->iFmt_VOP3; - IsaDecodeMethod method = tableSubDecode_OPU_VOP3[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OPU_VOP3 - - GPUStaticInst* - Decoder::subDecode_OP_VINTRP(MachInst iFmt) - { - InFmt_VINTRP *enc = &iFmt->iFmt_VINTRP; - IsaDecodeMethod method = tableSubDecode_OP_VINTRP[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_VINTRP - - GPUStaticInst* - Decoder::subDecode_OP_DS(MachInst iFmt) - { - InFmt_DS *enc = &iFmt->iFmt_DS; - IsaDecodeMethod method = tableSubDecode_OP_DS[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_DS - - GPUStaticInst* - Decoder::subDecode_OP_FLAT(MachInst iFmt) - { - InFmt_FLAT *enc = &iFmt->iFmt_FLAT; - IsaDecodeMethod method = tableSubDecode_OP_FLAT[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_FLAT - - GPUStaticInst* - Decoder::subDecode_OP_MUBUF(MachInst iFmt) - { - InFmt_MUBUF *enc = &iFmt->iFmt_MUBUF; - IsaDecodeMethod method = tableSubDecode_OP_MUBUF[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_MUBUF - - GPUStaticInst* - Decoder::subDecode_OP_MTBUF(MachInst iFmt) - { - InFmt_MTBUF *enc = 
&iFmt->iFmt_MTBUF; - IsaDecodeMethod method = tableSubDecode_OP_MTBUF[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_MTBUF - - GPUStaticInst* - Decoder::subDecode_OP_MIMG(MachInst iFmt) - { - InFmt_MIMG *enc = &iFmt->iFmt_MIMG; - IsaDecodeMethod method = tableSubDecode_OP_MIMG[enc->OP]; - return (this->*method)(iFmt); - } // subDecode_OP_MIMG - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_CNDMASK_B32(MachInst iFmt) - { - return new Inst_VOP2__V_CNDMASK_B32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_CNDMASK_B32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_ADD_F32(MachInst iFmt) - { - return new Inst_VOP2__V_ADD_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_ADD_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUB_F32(MachInst iFmt) - { - return new Inst_VOP2__V_SUB_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUB_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUBREV_F32(MachInst iFmt) - { - return new Inst_VOP2__V_SUBREV_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUBREV_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MUL_LEGACY_F32(MachInst iFmt) - { - return new Inst_VOP2__V_MUL_LEGACY_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MUL_LEGACY_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MUL_F32(MachInst iFmt) - { - return new Inst_VOP2__V_MUL_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MUL_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MUL_I32_I24(MachInst iFmt) - { - return new Inst_VOP2__V_MUL_I32_I24(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MUL_I32_I24 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MUL_HI_I32_I24(MachInst iFmt) - { - return new Inst_VOP2__V_MUL_HI_I32_I24(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MUL_HI_I32_I24 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MUL_U32_U24(MachInst iFmt) - { - return new Inst_VOP2__V_MUL_U32_U24(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MUL_U32_U24 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MUL_HI_U32_U24(MachInst iFmt) - { - return new 
Inst_VOP2__V_MUL_HI_U32_U24(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MUL_HI_U32_U24 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MIN_F32(MachInst iFmt) - { - return new Inst_VOP2__V_MIN_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MIN_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MAX_F32(MachInst iFmt) - { - return new Inst_VOP2__V_MAX_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MAX_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MIN_I32(MachInst iFmt) - { - return new Inst_VOP2__V_MIN_I32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MIN_I32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MAX_I32(MachInst iFmt) - { - return new Inst_VOP2__V_MAX_I32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MAX_I32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MIN_U32(MachInst iFmt) - { - return new Inst_VOP2__V_MIN_U32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MIN_U32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MAX_U32(MachInst iFmt) - { - return new Inst_VOP2__V_MAX_U32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MAX_U32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_LSHRREV_B32(MachInst iFmt) - { - return new Inst_VOP2__V_LSHRREV_B32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_LSHRREV_B32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_ASHRREV_I32(MachInst iFmt) - { - return new Inst_VOP2__V_ASHRREV_I32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_ASHRREV_I32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_LSHLREV_B32(MachInst iFmt) - { - return new Inst_VOP2__V_LSHLREV_B32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_LSHLREV_B32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_AND_B32(MachInst iFmt) - { - return new Inst_VOP2__V_AND_B32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_AND_B32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_OR_B32(MachInst iFmt) - { - return new Inst_VOP2__V_OR_B32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_OR_B32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_XOR_B32(MachInst iFmt) - { - return new 
Inst_VOP2__V_XOR_B32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_XOR_B32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MAC_F32(MachInst iFmt) - { - return new Inst_VOP2__V_MAC_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MAC_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MADMK_F32(MachInst iFmt) - { - return new Inst_VOP2__V_MADMK_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MADMK_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MADAK_F32(MachInst iFmt) - { - return new Inst_VOP2__V_MADAK_F32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MADAK_F32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_ADD_U32(MachInst iFmt) - { - return new Inst_VOP2__V_ADD_U32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_ADD_U32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUB_U32(MachInst iFmt) - { - return new Inst_VOP2__V_SUB_U32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUB_U32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUBREV_U32(MachInst iFmt) - { - return new Inst_VOP2__V_SUBREV_U32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUBREV_U32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_ADDC_U32(MachInst iFmt) - { - return new Inst_VOP2__V_ADDC_U32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_ADDC_U32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUBB_U32(MachInst iFmt) - { - return new Inst_VOP2__V_SUBB_U32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUBB_U32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUBBREV_U32(MachInst iFmt) - { - return new Inst_VOP2__V_SUBBREV_U32(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUBBREV_U32 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_ADD_F16(MachInst iFmt) - { - return new Inst_VOP2__V_ADD_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_ADD_F16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUB_F16(MachInst iFmt) - { - return new Inst_VOP2__V_SUB_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUB_F16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUBREV_F16(MachInst iFmt) - { - return new 
Inst_VOP2__V_SUBREV_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUBREV_F16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MUL_F16(MachInst iFmt) - { - return new Inst_VOP2__V_MUL_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MUL_F16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MAC_F16(MachInst iFmt) - { - return new Inst_VOP2__V_MAC_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MAC_F16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MADMK_F16(MachInst iFmt) - { - return new Inst_VOP2__V_MADMK_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MADMK_F16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MADAK_F16(MachInst iFmt) - { - return new Inst_VOP2__V_MADAK_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MADAK_F16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_ADD_U16(MachInst iFmt) - { - return new Inst_VOP2__V_ADD_U16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_ADD_U16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUB_U16(MachInst iFmt) - { - return new Inst_VOP2__V_SUB_U16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUB_U16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_SUBREV_U16(MachInst iFmt) - { - return new Inst_VOP2__V_SUBREV_U16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_SUBREV_U16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MUL_LO_U16(MachInst iFmt) - { - return new Inst_VOP2__V_MUL_LO_U16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MUL_LO_U16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_LSHLREV_B16(MachInst iFmt) - { - return new Inst_VOP2__V_LSHLREV_B16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_LSHLREV_B16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_LSHRREV_B16(MachInst iFmt) - { - return new Inst_VOP2__V_LSHRREV_B16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_LSHRREV_B16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_ASHRREV_I16(MachInst iFmt) - { - return new Inst_VOP2__V_ASHRREV_I16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_ASHRREV_I16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MAX_F16(MachInst iFmt) - { - return 
new Inst_VOP2__V_MAX_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MAX_F16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MIN_F16(MachInst iFmt) - { - return new Inst_VOP2__V_MIN_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MIN_F16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MAX_U16(MachInst iFmt) - { - return new Inst_VOP2__V_MAX_U16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MAX_U16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MAX_I16(MachInst iFmt) - { - return new Inst_VOP2__V_MAX_I16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MAX_I16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MIN_U16(MachInst iFmt) - { - return new Inst_VOP2__V_MIN_U16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MIN_U16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_MIN_I16(MachInst iFmt) - { - return new Inst_VOP2__V_MIN_I16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_MIN_I16 - - GPUStaticInst* - Decoder::decode_OP_VOP2__V_LDEXP_F16(MachInst iFmt) - { - return new Inst_VOP2__V_LDEXP_F16(&iFmt->iFmt_VOP2); - } // decode_OP_VOP2__V_LDEXP_F16 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_ADD_U32(MachInst iFmt) - { - return new Inst_SOP2__S_ADD_U32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ADD_U32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_SUB_U32(MachInst iFmt) - { - return new Inst_SOP2__S_SUB_U32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_SUB_U32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_ADD_I32(MachInst iFmt) - { - return new Inst_SOP2__S_ADD_I32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ADD_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_SUB_I32(MachInst iFmt) - { - return new Inst_SOP2__S_SUB_I32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_SUB_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_ADDC_U32(MachInst iFmt) - { - return new Inst_SOP2__S_ADDC_U32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ADDC_U32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_SUBB_U32(MachInst iFmt) - { - return new Inst_SOP2__S_SUBB_U32(&iFmt->iFmt_SOP2); - } // 
decode_OP_SOP2__S_SUBB_U32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_MIN_I32(MachInst iFmt) - { - return new Inst_SOP2__S_MIN_I32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_MIN_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_MIN_U32(MachInst iFmt) - { - return new Inst_SOP2__S_MIN_U32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_MIN_U32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_MAX_I32(MachInst iFmt) - { - return new Inst_SOP2__S_MAX_I32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_MAX_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_MAX_U32(MachInst iFmt) - { - return new Inst_SOP2__S_MAX_U32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_MAX_U32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_CSELECT_B32(MachInst iFmt) - { - return new Inst_SOP2__S_CSELECT_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_CSELECT_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_CSELECT_B64(MachInst iFmt) - { - return new Inst_SOP2__S_CSELECT_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_CSELECT_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_AND_B32(MachInst iFmt) - { - return new Inst_SOP2__S_AND_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_AND_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_AND_B64(MachInst iFmt) - { - return new Inst_SOP2__S_AND_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_AND_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_OR_B32(MachInst iFmt) - { - return new Inst_SOP2__S_OR_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_OR_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_OR_B64(MachInst iFmt) - { - return new Inst_SOP2__S_OR_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_OR_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_XOR_B32(MachInst iFmt) - { - return new Inst_SOP2__S_XOR_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_XOR_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_XOR_B64(MachInst iFmt) - { - return new Inst_SOP2__S_XOR_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_XOR_B64 - - GPUStaticInst* - 
Decoder::decode_OP_SOP2__S_ANDN2_B32(MachInst iFmt) - { - return new Inst_SOP2__S_ANDN2_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ANDN2_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_ANDN2_B64(MachInst iFmt) - { - return new Inst_SOP2__S_ANDN2_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ANDN2_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_ORN2_B32(MachInst iFmt) - { - return new Inst_SOP2__S_ORN2_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ORN2_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_ORN2_B64(MachInst iFmt) - { - return new Inst_SOP2__S_ORN2_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ORN2_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_NAND_B32(MachInst iFmt) - { - return new Inst_SOP2__S_NAND_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_NAND_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_NAND_B64(MachInst iFmt) - { - return new Inst_SOP2__S_NAND_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_NAND_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_NOR_B32(MachInst iFmt) - { - return new Inst_SOP2__S_NOR_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_NOR_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_NOR_B64(MachInst iFmt) - { - return new Inst_SOP2__S_NOR_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_NOR_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_XNOR_B32(MachInst iFmt) - { - return new Inst_SOP2__S_XNOR_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_XNOR_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_XNOR_B64(MachInst iFmt) - { - return new Inst_SOP2__S_XNOR_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_XNOR_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_LSHL_B32(MachInst iFmt) - { - return new Inst_SOP2__S_LSHL_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_LSHL_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_LSHL_B64(MachInst iFmt) - { - return new Inst_SOP2__S_LSHL_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_LSHL_B64 - - GPUStaticInst* - 
Decoder::decode_OP_SOP2__S_LSHR_B32(MachInst iFmt) - { - return new Inst_SOP2__S_LSHR_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_LSHR_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_LSHR_B64(MachInst iFmt) - { - return new Inst_SOP2__S_LSHR_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_LSHR_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_ASHR_I32(MachInst iFmt) - { - return new Inst_SOP2__S_ASHR_I32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ASHR_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_ASHR_I64(MachInst iFmt) - { - return new Inst_SOP2__S_ASHR_I64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ASHR_I64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_BFM_B32(MachInst iFmt) - { - return new Inst_SOP2__S_BFM_B32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_BFM_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_BFM_B64(MachInst iFmt) - { - return new Inst_SOP2__S_BFM_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_BFM_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_MUL_I32(MachInst iFmt) - { - return new Inst_SOP2__S_MUL_I32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_MUL_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_BFE_U32(MachInst iFmt) - { - return new Inst_SOP2__S_BFE_U32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_BFE_U32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_BFE_I32(MachInst iFmt) - { - return new Inst_SOP2__S_BFE_I32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_BFE_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_BFE_U64(MachInst iFmt) - { - return new Inst_SOP2__S_BFE_U64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_BFE_U64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_BFE_I64(MachInst iFmt) - { - return new Inst_SOP2__S_BFE_I64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_BFE_I64 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_CBRANCH_G_FORK(MachInst iFmt) - { - return new Inst_SOP2__S_CBRANCH_G_FORK(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_CBRANCH_G_FORK - - GPUStaticInst* - 
Decoder::decode_OP_SOP2__S_ABSDIFF_I32(MachInst iFmt) - { - return new Inst_SOP2__S_ABSDIFF_I32(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_ABSDIFF_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP2__S_RFE_RESTORE_B64(MachInst iFmt) - { - return new Inst_SOP2__S_RFE_RESTORE_B64(&iFmt->iFmt_SOP2); - } // decode_OP_SOP2__S_RFE_RESTORE_B64 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_MOVK_I32(MachInst iFmt) - { - return new Inst_SOPK__S_MOVK_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_MOVK_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMOVK_I32(MachInst iFmt) - { - return new Inst_SOPK__S_CMOVK_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMOVK_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_EQ_I32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_EQ_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_EQ_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_LG_I32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_LG_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_LG_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_GT_I32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_GT_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_GT_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_GE_I32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_GE_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_GE_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_LT_I32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_LT_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_LT_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_LE_I32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_LE_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_LE_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_EQ_U32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_EQ_U32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_EQ_U32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_LG_U32(MachInst iFmt) - { - return new 
Inst_SOPK__S_CMPK_LG_U32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_LG_U32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_GT_U32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_GT_U32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_GT_U32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_GE_U32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_GE_U32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_GE_U32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_LT_U32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_LT_U32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_LT_U32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CMPK_LE_U32(MachInst iFmt) - { - return new Inst_SOPK__S_CMPK_LE_U32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CMPK_LE_U32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_ADDK_I32(MachInst iFmt) - { - return new Inst_SOPK__S_ADDK_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_ADDK_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_MULK_I32(MachInst iFmt) - { - return new Inst_SOPK__S_MULK_I32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_MULK_I32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_CBRANCH_I_FORK(MachInst iFmt) - { - return new Inst_SOPK__S_CBRANCH_I_FORK(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_CBRANCH_I_FORK - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_GETREG_B32(MachInst iFmt) - { - return new Inst_SOPK__S_GETREG_B32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_GETREG_B32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_SETREG_B32(MachInst iFmt) - { - return new Inst_SOPK__S_SETREG_B32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_SETREG_B32 - - GPUStaticInst* - Decoder::decode_OP_SOPK__S_SETREG_IMM32_B32(MachInst iFmt) - { - return new Inst_SOPK__S_SETREG_IMM32_B32(&iFmt->iFmt_SOPK); - } // decode_OP_SOPK__S_SETREG_IMM32_B32 - - GPUStaticInst* - Decoder::decode_OP_EXP(MachInst iFmt) - { - return new Inst_EXP__EXP(&iFmt->iFmt_EXP); - } // decode_OP_EXP - - GPUStaticInst* - 
Decoder::decode_OPU_VOP3__V_CMP_CLASS_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_CLASS_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_CLASS_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_CLASS_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_CLASS_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_CLASS_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_CLASS_F64(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_CLASS_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_CLASS_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_CLASS_F64(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_CLASS_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_CLASS_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_CLASS_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_CLASS_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_CLASS_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_CLASS_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_CLASS_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_CLASS_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_F_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_F_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_F_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_LT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_LT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_LT_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_EQ_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_EQ_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_EQ_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_LE_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_LE_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_LE_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_GT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_GT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_GT_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_LG_F16(MachInst 
iFmt) - { - return new Inst_VOP3__V_CMP_LG_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_LG_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_GE_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_GE_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_GE_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_O_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_O_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_O_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_U_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_U_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_U_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_NGE_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_NGE_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_NGE_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_NLG_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_NLG_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_NLG_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_NGT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_NGT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_NGT_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_NLE_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_NLE_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_NLE_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_NEQ_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_NEQ_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_NEQ_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_NLT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_NLT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_NLT_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMP_TRU_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMP_TRU_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMP_TRU_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_F_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_F_F16(&iFmt->iFmt_VOP3); - } // 
decode_OPU_VOP3__V_CMPX_F_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_LT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_LT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_LT_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_EQ_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_EQ_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_EQ_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_LE_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_LE_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_LE_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_GT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_GT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_GT_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_LG_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_LG_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_LG_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_GE_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_GE_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_GE_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_O_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_O_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_O_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_U_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_U_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_U_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_NGE_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_NGE_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_NGE_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_NLG_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_NLG_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_NLG_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CMPX_NGT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CMPX_NGT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CMPX_NGT_F16 - - GPUStaticInst* - 
// VOP3-encoded floating-point compare decoders (F16 tail, all F32, all F64).
// Each stub is a trivial factory: it constructs the matching Inst_VOP3__*
// instruction object from the machine word's VOP3-format fields
// (iFmt->iFmt_VOP3) and returns it as a newly allocated GPUStaticInst;
// ownership of the returned object passes to the caller.
// NOTE(review): the leading return type of the first stub below falls just
// outside this chunk and is reconstructed from the pattern that every
// sibling stub in this file follows -- confirm against the full file.

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NLE_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NLE_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NLE_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NEQ_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NEQ_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NEQ_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NLT_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NLT_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NLT_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_TRU_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_TRU_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_TRU_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_F_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_F_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_F_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LT_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LT_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LT_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_EQ_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_EQ_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_EQ_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LE_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LE_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LE_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GT_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GT_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GT_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LG_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LG_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LG_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GE_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GE_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GE_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_O_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_O_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_O_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_U_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_U_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_U_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NGE_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NGE_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NGE_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NLG_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NLG_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NLG_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NGT_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NGT_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NGT_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NLE_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NLE_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NLE_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NEQ_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NEQ_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NEQ_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NLT_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NLT_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NLT_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_TRU_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_TRU_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_TRU_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_F_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_F_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_F_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LT_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LT_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LT_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_EQ_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_EQ_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_EQ_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LE_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LE_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LE_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GT_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GT_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GT_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LG_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LG_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LG_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GE_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GE_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GE_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_O_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_O_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_O_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_U_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_U_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_U_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NGE_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NGE_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NGE_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NLG_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NLG_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NLG_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NGT_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NGT_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NGT_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NLE_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NLE_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NLE_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NEQ_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NEQ_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NEQ_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NLT_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NLT_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NLT_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_TRU_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_TRU_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_TRU_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_F_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_F_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_F_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LT_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LT_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LT_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_EQ_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_EQ_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_EQ_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LE_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LE_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LE_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GT_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GT_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GT_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LG_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LG_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LG_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GE_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GE_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GE_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_O_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_O_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_O_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_U_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_U_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_U_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NGE_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NGE_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NGE_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NLG_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NLG_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NLG_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NGT_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NGT_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NGT_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NLE_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NLE_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NLE_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NEQ_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NEQ_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NEQ_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NLT_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NLT_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NLT_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_TRU_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_TRU_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_TRU_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_F_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_F_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_F_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LT_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LT_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LT_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_EQ_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_EQ_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_EQ_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LE_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LE_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LE_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GT_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GT_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GT_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LG_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LG_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LG_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GE_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GE_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GE_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_O_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_O_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_O_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_U_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_U_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_U_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NGE_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NGE_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NGE_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NLG_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NLG_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NLG_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NGT_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NGT_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NGT_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NLE_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NLE_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NLE_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NEQ_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NEQ_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NEQ_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NLT_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NLT_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NLT_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_TRU_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_TRU_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_TRU_F64
// VOP3-encoded integer compare decoders (I16/U16, I32/U32, I64 and the
// first CMPX U64 stubs). Same trivial-factory pattern as the float
// compares above: each stub allocates the matching Inst_VOP3__* from
// the VOP3 format fields (iFmt->iFmt_VOP3) and hands ownership of the
// new GPUStaticInst to the caller.

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_F_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_F_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_F_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LT_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LT_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LT_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_EQ_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_EQ_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_EQ_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LE_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LE_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LE_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GT_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GT_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GT_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NE_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NE_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NE_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GE_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GE_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GE_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_T_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_T_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_T_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_F_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_F_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_F_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LT_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LT_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LT_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_EQ_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_EQ_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_EQ_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LE_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LE_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LE_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GT_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GT_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GT_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NE_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NE_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NE_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GE_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GE_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GE_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_T_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_T_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_T_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_F_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_F_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_F_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LT_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LT_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LT_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_EQ_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_EQ_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_EQ_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LE_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LE_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LE_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GT_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GT_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GT_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NE_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NE_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NE_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GE_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GE_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GE_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_T_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_T_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_T_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_F_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_F_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_F_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LT_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LT_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LT_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_EQ_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_EQ_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_EQ_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LE_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LE_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LE_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GT_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GT_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GT_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NE_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NE_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NE_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GE_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GE_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GE_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_T_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_T_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_T_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_F_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_F_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_F_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LT_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LT_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LT_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_EQ_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_EQ_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_EQ_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LE_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LE_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LE_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GT_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GT_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GT_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NE_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NE_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NE_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GE_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GE_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GE_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_T_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_T_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_T_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_F_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_F_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_F_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LT_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LT_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LT_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_EQ_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_EQ_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_EQ_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LE_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LE_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LE_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GT_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GT_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GT_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NE_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NE_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NE_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GE_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GE_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GE_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_T_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_T_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_T_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_F_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_F_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_F_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LT_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LT_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LT_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_EQ_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_EQ_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_EQ_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LE_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LE_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LE_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GT_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GT_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GT_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NE_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NE_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NE_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GE_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GE_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GE_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_T_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_T_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_T_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_F_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_F_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_F_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LT_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LT_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LT_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_EQ_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_EQ_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_EQ_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LE_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LE_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LE_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GT_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GT_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GT_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NE_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NE_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NE_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GE_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GE_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GE_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_T_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_T_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_T_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_F_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_F_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_F_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LT_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LT_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LT_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_EQ_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_EQ_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_EQ_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LE_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LE_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LE_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GT_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GT_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GT_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NE_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NE_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NE_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GE_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GE_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GE_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_T_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_T_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_T_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_F_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_F_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_F_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LT_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LT_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LT_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_EQ_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_EQ_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_EQ_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_LE_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_LE_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_LE_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GT_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GT_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GT_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_NE_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_NE_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_NE_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_GE_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_GE_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_GE_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMP_T_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMP_T_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMP_T_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_F_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_F_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_F_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LT_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LT_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LT_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_EQ_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_EQ_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_EQ_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LE_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LE_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LE_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GT_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GT_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GT_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NE_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NE_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NE_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GE_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GE_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GE_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_T_I64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_T_I64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_T_I64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_F_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_F_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_F_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LT_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LT_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LT_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_EQ_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_EQ_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_EQ_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_LE_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_LE_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_LE_U64
// Remaining VOP3 decoders: the tail of the CMPX U64 compares, the VOP3
// arithmetic/logic/shift ops, and VOP1-style ops carried in the VOP3
// encoding (V_NOP, V_MOV_B32, the V_CVT_* conversions, rounding ops).
// Same factory pattern: build the matching Inst_VOP3__* from the VOP3
// fields and return it; caller owns the new GPUStaticInst. The six
// U32 add/sub/carry ops are the one exception -- they read the
// iFmt_VOP3_SDST_ENC view of the instruction word instead of iFmt_VOP3
// (see the per-function bodies below).

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GT_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GT_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GT_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_NE_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_NE_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_NE_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_GE_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_GE_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_GE_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CMPX_T_U64(MachInst iFmt)
{
    return new Inst_VOP3__V_CMPX_T_U64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CMPX_T_U64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CNDMASK_B32(MachInst iFmt)
{
    return new Inst_VOP3__V_CNDMASK_B32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CNDMASK_B32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_ADD_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_ADD_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_ADD_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUB_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_SUB_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_SUB_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUBREV_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_SUBREV_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_SUBREV_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MUL_LEGACY_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_MUL_LEGACY_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MUL_LEGACY_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MUL_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_MUL_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MUL_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MUL_I32_I24(MachInst iFmt)
{
    return new Inst_VOP3__V_MUL_I32_I24(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MUL_I32_I24

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MUL_HI_I32_I24(MachInst iFmt)
{
    return new Inst_VOP3__V_MUL_HI_I32_I24(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MUL_HI_I32_I24

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MUL_U32_U24(MachInst iFmt)
{
    return new Inst_VOP3__V_MUL_U32_U24(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MUL_U32_U24

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MUL_HI_U32_U24(MachInst iFmt)
{
    return new Inst_VOP3__V_MUL_HI_U32_U24(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MUL_HI_U32_U24

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MIN_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_MIN_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MIN_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MAX_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_MAX_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MAX_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MIN_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_MIN_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MIN_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MAX_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_MAX_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MAX_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MIN_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_MIN_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MIN_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MAX_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_MAX_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MAX_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_LSHRREV_B32(MachInst iFmt)
{
    return new Inst_VOP3__V_LSHRREV_B32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_LSHRREV_B32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_ASHRREV_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_ASHRREV_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_ASHRREV_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_LSHLREV_B32(MachInst iFmt)
{
    return new Inst_VOP3__V_LSHLREV_B32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_LSHLREV_B32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_AND_B32(MachInst iFmt)
{
    return new Inst_VOP3__V_AND_B32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_AND_B32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_OR_B32(MachInst iFmt)
{
    return new Inst_VOP3__V_OR_B32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_OR_B32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_XOR_B32(MachInst iFmt)
{
    return new Inst_VOP3__V_XOR_B32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_XOR_B32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MAC_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_MAC_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MAC_F32

// The following six decoders pass the iFmt_VOP3_SDST_ENC view of the
// instruction word rather than iFmt_VOP3 -- the only stubs in this
// group that do so.
GPUStaticInst*
Decoder::decode_OPU_VOP3__V_ADD_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_ADD_U32(&iFmt->iFmt_VOP3_SDST_ENC);
} // decode_OPU_VOP3__V_ADD_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUB_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_SUB_U32(&iFmt->iFmt_VOP3_SDST_ENC);
} // decode_OPU_VOP3__V_SUB_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUBREV_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3_SDST_ENC);
} // decode_OPU_VOP3__V_SUBREV_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_ADDC_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_ADDC_U32(&iFmt->iFmt_VOP3_SDST_ENC);
} // decode_OPU_VOP3__V_ADDC_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUBB_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_SUBB_U32(&iFmt->iFmt_VOP3_SDST_ENC);
} // decode_OPU_VOP3__V_SUBB_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUBBREV_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_SUBBREV_U32(&iFmt->iFmt_VOP3_SDST_ENC);
} // decode_OPU_VOP3__V_SUBBREV_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_ADD_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_ADD_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_ADD_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUB_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_SUB_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_SUB_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUBREV_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_SUBREV_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_SUBREV_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MUL_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_MUL_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MUL_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MAC_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_MAC_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MAC_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_ADD_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_ADD_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_ADD_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUB_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_SUB_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_SUB_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_SUBREV_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_SUBREV_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_SUBREV_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MUL_LO_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_MUL_LO_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MUL_LO_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_LSHLREV_B16(MachInst iFmt)
{
    return new Inst_VOP3__V_LSHLREV_B16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_LSHLREV_B16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_LSHRREV_B16(MachInst iFmt)
{
    return new Inst_VOP3__V_LSHRREV_B16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_LSHRREV_B16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_ASHRREV_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_ASHRREV_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_ASHRREV_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MAX_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_MAX_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MAX_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MIN_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_MIN_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MIN_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MAX_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_MAX_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MAX_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MAX_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_MAX_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MAX_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MIN_U16(MachInst iFmt)
{
    return new Inst_VOP3__V_MIN_U16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MIN_U16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MIN_I16(MachInst iFmt)
{
    return new Inst_VOP3__V_MIN_I16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MIN_I16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_LDEXP_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_LDEXP_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_LDEXP_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt)
{
    return new Inst_VOP3__V_NOP(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_NOP

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MOV_B32(MachInst iFmt)
{
    return new Inst_VOP3__V_MOV_B32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MOV_B32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_I32_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_I32_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_I32_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F64_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F64_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F64_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F32_I32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F32_I32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F32_I32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F32_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F32_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F32_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_U32_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_U32_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_U32_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_I32_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_I32_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_I32_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_MOV_FED_B32(MachInst iFmt)
{
    return new Inst_VOP3__V_MOV_FED_B32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_MOV_FED_B32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F16_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F16_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F16_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F32_F16(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F32_F16(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F32_F16

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_RPI_I32_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_RPI_I32_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_RPI_I32_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_FLR_I32_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_FLR_I32_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_FLR_I32_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_OFF_F32_I4(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_OFF_F32_I4(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_OFF_F32_I4

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F32_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F32_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F32_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F64_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F64_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F64_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F32_UBYTE0(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F32_UBYTE0(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F32_UBYTE0

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F32_UBYTE1(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F32_UBYTE1(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F32_UBYTE1

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F32_UBYTE2(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F32_UBYTE2(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F32_UBYTE2

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F32_UBYTE3(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F32_UBYTE3(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F32_UBYTE3

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_U32_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_U32_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_U32_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CVT_F64_U32(MachInst iFmt)
{
    return new Inst_VOP3__V_CVT_F64_U32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CVT_F64_U32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_TRUNC_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_TRUNC_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_TRUNC_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CEIL_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_CEIL_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CEIL_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_RNDNE_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_RNDNE_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_RNDNE_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_FLOOR_F64(MachInst iFmt)
{
    return new Inst_VOP3__V_FLOOR_F64(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_FLOOR_F64

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_FRACT_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_FRACT_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_FRACT_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_TRUNC_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_TRUNC_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_TRUNC_F32

GPUStaticInst*
Decoder::decode_OPU_VOP3__V_CEIL_F32(MachInst iFmt)
{
    return new Inst_VOP3__V_CEIL_F32(&iFmt->iFmt_VOP3);
} // decode_OPU_VOP3__V_CEIL_F32
GPUStaticInst* - Decoder::decode_OPU_VOP3__V_RNDNE_F32(MachInst iFmt) - { - return new Inst_VOP3__V_RNDNE_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_RNDNE_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FLOOR_F32(MachInst iFmt) - { - return new Inst_VOP3__V_FLOOR_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FLOOR_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_EXP_F32(MachInst iFmt) - { - return new Inst_VOP3__V_EXP_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_EXP_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_LOG_F32(MachInst iFmt) - { - return new Inst_VOP3__V_LOG_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_LOG_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_RCP_F32(MachInst iFmt) - { - return new Inst_VOP3__V_RCP_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_RCP_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_RCP_IFLAG_F32(MachInst iFmt) - { - return new Inst_VOP3__V_RCP_IFLAG_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_RCP_IFLAG_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_RSQ_F32(MachInst iFmt) - { - return new Inst_VOP3__V_RSQ_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_RSQ_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_RCP_F64(MachInst iFmt) - { - return new Inst_VOP3__V_RCP_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_RCP_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_RSQ_F64(MachInst iFmt) - { - return new Inst_VOP3__V_RSQ_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_RSQ_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_SQRT_F32(MachInst iFmt) - { - return new Inst_VOP3__V_SQRT_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_SQRT_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_SQRT_F64(MachInst iFmt) - { - return new Inst_VOP3__V_SQRT_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_SQRT_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_SIN_F32(MachInst iFmt) - { - return new Inst_VOP3__V_SIN_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_SIN_F32 - - 
GPUStaticInst* - Decoder::decode_OPU_VOP3__V_COS_F32(MachInst iFmt) - { - return new Inst_VOP3__V_COS_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_COS_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_NOT_B32(MachInst iFmt) - { - return new Inst_VOP3__V_NOT_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_NOT_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_BFREV_B32(MachInst iFmt) - { - return new Inst_VOP3__V_BFREV_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_BFREV_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FFBH_U32(MachInst iFmt) - { - return new Inst_VOP3__V_FFBH_U32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FFBH_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FFBL_B32(MachInst iFmt) - { - return new Inst_VOP3__V_FFBL_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FFBL_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FFBH_I32(MachInst iFmt) - { - return new Inst_VOP3__V_FFBH_I32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FFBH_I32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FREXP_EXP_I32_F64(MachInst iFmt) - { - return new Inst_VOP3__V_FREXP_EXP_I32_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FREXP_EXP_I32_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FREXP_MANT_F64(MachInst iFmt) - { - return new Inst_VOP3__V_FREXP_MANT_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FREXP_MANT_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FRACT_F64(MachInst iFmt) - { - return new Inst_VOP3__V_FRACT_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FRACT_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FREXP_EXP_I32_F32(MachInst iFmt) - { - return new Inst_VOP3__V_FREXP_EXP_I32_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FREXP_EXP_I32_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FREXP_MANT_F32(MachInst iFmt) - { - return new Inst_VOP3__V_FREXP_MANT_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FREXP_MANT_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CLREXCP(MachInst iFmt) - { - return 
new Inst_VOP3__V_CLREXCP(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CLREXCP - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_F16_U16(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_F16_U16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_F16_U16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_F16_I16(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_F16_I16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_F16_I16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_U16_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_U16_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_U16_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_I16_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_I16_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_I16_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_RCP_F16(MachInst iFmt) - { - return new Inst_VOP3__V_RCP_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_RCP_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_SQRT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_SQRT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_SQRT_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_RSQ_F16(MachInst iFmt) - { - return new Inst_VOP3__V_RSQ_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_RSQ_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_LOG_F16(MachInst iFmt) - { - return new Inst_VOP3__V_LOG_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_LOG_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_EXP_F16(MachInst iFmt) - { - return new Inst_VOP3__V_EXP_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_EXP_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FREXP_MANT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_FREXP_MANT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FREXP_MANT_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FREXP_EXP_I16_F16(MachInst iFmt) - { - return new Inst_VOP3__V_FREXP_EXP_I16_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FREXP_EXP_I16_F16 - - GPUStaticInst* - 
Decoder::decode_OPU_VOP3__V_FLOOR_F16(MachInst iFmt) - { - return new Inst_VOP3__V_FLOOR_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FLOOR_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CEIL_F16(MachInst iFmt) - { - return new Inst_VOP3__V_CEIL_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CEIL_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_TRUNC_F16(MachInst iFmt) - { - return new Inst_VOP3__V_TRUNC_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_TRUNC_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_RNDNE_F16(MachInst iFmt) - { - return new Inst_VOP3__V_RNDNE_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_RNDNE_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FRACT_F16(MachInst iFmt) - { - return new Inst_VOP3__V_FRACT_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FRACT_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_SIN_F16(MachInst iFmt) - { - return new Inst_VOP3__V_SIN_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_SIN_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_COS_F16(MachInst iFmt) - { - return new Inst_VOP3__V_COS_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_COS_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_EXP_LEGACY_F32(MachInst iFmt) - { - return new Inst_VOP3__V_EXP_LEGACY_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_EXP_LEGACY_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_LOG_LEGACY_F32(MachInst iFmt) - { - return new Inst_VOP3__V_LOG_LEGACY_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_LOG_LEGACY_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAD_LEGACY_F32(MachInst iFmt) - { - return new Inst_VOP3__V_MAD_LEGACY_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAD_LEGACY_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAD_F32(MachInst iFmt) - { - return new Inst_VOP3__V_MAD_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAD_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAD_I32_I24(MachInst iFmt) - { - return new Inst_VOP3__V_MAD_I32_I24(&iFmt->iFmt_VOP3); 
- } // decode_OPU_VOP3__V_MAD_I32_I24 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAD_U32_U24(MachInst iFmt) - { - return new Inst_VOP3__V_MAD_U32_U24(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAD_U32_U24 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CUBEID_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CUBEID_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CUBEID_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CUBESC_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CUBESC_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CUBESC_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CUBETC_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CUBETC_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CUBETC_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CUBEMA_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CUBEMA_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CUBEMA_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_BFE_U32(MachInst iFmt) - { - return new Inst_VOP3__V_BFE_U32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_BFE_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_BFE_I32(MachInst iFmt) - { - return new Inst_VOP3__V_BFE_I32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_BFE_I32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_BFI_B32(MachInst iFmt) - { - return new Inst_VOP3__V_BFI_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_BFI_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FMA_F32(MachInst iFmt) - { - return new Inst_VOP3__V_FMA_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FMA_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FMA_F64(MachInst iFmt) - { - return new Inst_VOP3__V_FMA_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FMA_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_LERP_U8(MachInst iFmt) - { - return new Inst_VOP3__V_LERP_U8(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_LERP_U8 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_ALIGNBIT_B32(MachInst iFmt) - { - return new 
Inst_VOP3__V_ALIGNBIT_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_ALIGNBIT_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_ALIGNBYTE_B32(MachInst iFmt) - { - return new Inst_VOP3__V_ALIGNBYTE_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_ALIGNBYTE_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MIN3_F32(MachInst iFmt) - { - return new Inst_VOP3__V_MIN3_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MIN3_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MIN3_I32(MachInst iFmt) - { - return new Inst_VOP3__V_MIN3_I32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MIN3_I32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MIN3_U32(MachInst iFmt) - { - return new Inst_VOP3__V_MIN3_U32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MIN3_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAX3_F32(MachInst iFmt) - { - return new Inst_VOP3__V_MAX3_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAX3_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAX3_I32(MachInst iFmt) - { - return new Inst_VOP3__V_MAX3_I32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAX3_I32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAX3_U32(MachInst iFmt) - { - return new Inst_VOP3__V_MAX3_U32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAX3_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MED3_F32(MachInst iFmt) - { - return new Inst_VOP3__V_MED3_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MED3_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MED3_I32(MachInst iFmt) - { - return new Inst_VOP3__V_MED3_I32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MED3_I32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MED3_U32(MachInst iFmt) - { - return new Inst_VOP3__V_MED3_U32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MED3_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_SAD_U8(MachInst iFmt) - { - return new Inst_VOP3__V_SAD_U8(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_SAD_U8 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_SAD_HI_U8(MachInst iFmt) - { - 
return new Inst_VOP3__V_SAD_HI_U8(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_SAD_HI_U8 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_SAD_U16(MachInst iFmt) - { - return new Inst_VOP3__V_SAD_U16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_SAD_U16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_SAD_U32(MachInst iFmt) - { - return new Inst_VOP3__V_SAD_U32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_SAD_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_PK_U8_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_PK_U8_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_PK_U8_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F32(MachInst iFmt) - { - return new Inst_VOP3__V_DIV_FIXUP_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_DIV_FIXUP_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F64(MachInst iFmt) - { - return new Inst_VOP3__V_DIV_FIXUP_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_DIV_FIXUP_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_DIV_SCALE_F32(MachInst iFmt) - { - return new Inst_VOP3__V_DIV_SCALE_F32(&iFmt->iFmt_VOP3_SDST_ENC); - } // decode_OPU_VOP3__V_DIV_SCALE_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_DIV_SCALE_F64(MachInst iFmt) - { - return new Inst_VOP3__V_DIV_SCALE_F64(&iFmt->iFmt_VOP3_SDST_ENC); - } // decode_OPU_VOP3__V_DIV_SCALE_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_DIV_FMAS_F32(MachInst iFmt) - { - return new Inst_VOP3__V_DIV_FMAS_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_DIV_FMAS_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_DIV_FMAS_F64(MachInst iFmt) - { - return new Inst_VOP3__V_DIV_FMAS_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_DIV_FMAS_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MSAD_U8(MachInst iFmt) - { - return new Inst_VOP3__V_MSAD_U8(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MSAD_U8 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_QSAD_PK_U16_U8(MachInst iFmt) - { - return new Inst_VOP3__V_QSAD_PK_U16_U8(&iFmt->iFmt_VOP3); 
- } // decode_OPU_VOP3__V_QSAD_PK_U16_U8 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MQSAD_PK_U16_U8(MachInst iFmt) - { - return new Inst_VOP3__V_MQSAD_PK_U16_U8(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MQSAD_PK_U16_U8 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MQSAD_U32_U8(MachInst iFmt) - { - return new Inst_VOP3__V_MQSAD_U32_U8(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MQSAD_U32_U8 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAD_U64_U32(MachInst iFmt) - { - return new Inst_VOP3__V_MAD_U64_U32(&iFmt->iFmt_VOP3_SDST_ENC); - } // decode_OPU_VOP3__V_MAD_U64_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAD_I64_I32(MachInst iFmt) - { - return new Inst_VOP3__V_MAD_I64_I32(&iFmt->iFmt_VOP3_SDST_ENC); - } // decode_OPU_VOP3__V_MAD_I64_I32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAD_F16(MachInst iFmt) - { - return new Inst_VOP3__V_MAD_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAD_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAD_U16(MachInst iFmt) - { - return new Inst_VOP3__V_MAD_U16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAD_U16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAD_I16(MachInst iFmt) - { - return new Inst_VOP3__V_MAD_I16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAD_I16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_PERM_B32(MachInst iFmt) - { - return new Inst_VOP3__V_PERM_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_PERM_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_FMA_F16(MachInst iFmt) - { - return new Inst_VOP3__V_FMA_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_FMA_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst iFmt) - { - return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_DIV_FIXUP_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_PKACCUM_U8_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_PKACCUM_U8_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_PKACCUM_U8_F32 - - GPUStaticInst* - 
Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt) - { - return new Inst_VOP3__V_INTERP_P1_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_INTERP_P1_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_INTERP_P2_F32(MachInst iFmt) - { - return new Inst_VOP3__V_INTERP_P2_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_INTERP_P2_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst iFmt) - { - return new Inst_VOP3__V_INTERP_MOV_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_INTERP_MOV_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_INTERP_P1LL_F16(MachInst iFmt) - { - return new Inst_VOP3__V_INTERP_P1LL_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_INTERP_P1LL_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_INTERP_P1LV_F16(MachInst iFmt) - { - return new Inst_VOP3__V_INTERP_P1LV_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_INTERP_P1LV_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_INTERP_P2_F16(MachInst iFmt) - { - return new Inst_VOP3__V_INTERP_P2_F16(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_INTERP_P2_F16 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_ADD_F64(MachInst iFmt) - { - return new Inst_VOP3__V_ADD_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_ADD_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MUL_F64(MachInst iFmt) - { - return new Inst_VOP3__V_MUL_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MUL_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MIN_F64(MachInst iFmt) - { - return new Inst_VOP3__V_MIN_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MIN_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MAX_F64(MachInst iFmt) - { - return new Inst_VOP3__V_MAX_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MAX_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_LDEXP_F64(MachInst iFmt) - { - return new Inst_VOP3__V_LDEXP_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_LDEXP_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MUL_LO_U32(MachInst iFmt) - { - return new 
Inst_VOP3__V_MUL_LO_U32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MUL_LO_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MUL_HI_U32(MachInst iFmt) - { - return new Inst_VOP3__V_MUL_HI_U32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MUL_HI_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MUL_HI_I32(MachInst iFmt) - { - return new Inst_VOP3__V_MUL_HI_I32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MUL_HI_I32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_LDEXP_F32(MachInst iFmt) - { - return new Inst_VOP3__V_LDEXP_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_LDEXP_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_READLANE_B32(MachInst iFmt) - { - return new Inst_VOP3__V_READLANE_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_READLANE_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_WRITELANE_B32(MachInst iFmt) - { - return new Inst_VOP3__V_WRITELANE_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_WRITELANE_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_BCNT_U32_B32(MachInst iFmt) - { - return new Inst_VOP3__V_BCNT_U32_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_BCNT_U32_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MBCNT_LO_U32_B32(MachInst iFmt) - { - return new Inst_VOP3__V_MBCNT_LO_U32_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MBCNT_LO_U32_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_MBCNT_HI_U32_B32(MachInst iFmt) - { - return new Inst_VOP3__V_MBCNT_HI_U32_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_MBCNT_HI_U32_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_LSHLREV_B64(MachInst iFmt) - { - return new Inst_VOP3__V_LSHLREV_B64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_LSHLREV_B64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_LSHRREV_B64(MachInst iFmt) - { - return new Inst_VOP3__V_LSHRREV_B64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_LSHRREV_B64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_ASHRREV_I64(MachInst iFmt) - { - return new Inst_VOP3__V_ASHRREV_I64(&iFmt->iFmt_VOP3); - 
} // decode_OPU_VOP3__V_ASHRREV_I64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_TRIG_PREOP_F64(MachInst iFmt) - { - return new Inst_VOP3__V_TRIG_PREOP_F64(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_TRIG_PREOP_F64 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_BFM_B32(MachInst iFmt) - { - return new Inst_VOP3__V_BFM_B32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_BFM_B32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_PKNORM_I16_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_PKNORM_I16_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_PKNORM_I16_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_PKNORM_U16_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_PKNORM_U16_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_PKNORM_U16_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_PKRTZ_F16_F32(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_PKRTZ_F16_F32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_PKRTZ_F16_F32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_PK_U16_U32(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_PK_U16_U32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_PK_U16_U32 - - GPUStaticInst* - Decoder::decode_OPU_VOP3__V_CVT_PK_I16_I32(MachInst iFmt) - { - return new Inst_VOP3__V_CVT_PK_I16_I32(&iFmt->iFmt_VOP3); - } // decode_OPU_VOP3__V_CVT_PK_I16_I32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_ADD_U32(MachInst iFmt) - { - return new Inst_DS__DS_ADD_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ADD_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_SUB_U32(MachInst iFmt) - { - return new Inst_DS__DS_SUB_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_SUB_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_RSUB_U32(MachInst iFmt) - { - return new Inst_DS__DS_RSUB_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_RSUB_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_INC_U32(MachInst iFmt) - { - return new Inst_DS__DS_INC_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_INC_U32 - - GPUStaticInst* - 
Decoder::decode_OP_DS__DS_DEC_U32(MachInst iFmt) - { - return new Inst_DS__DS_DEC_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_DEC_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_I32(MachInst iFmt) - { - return new Inst_DS__DS_MIN_I32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_I32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_I32(MachInst iFmt) - { - return new Inst_DS__DS_MAX_I32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_I32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_U32(MachInst iFmt) - { - return new Inst_DS__DS_MIN_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_U32(MachInst iFmt) - { - return new Inst_DS__DS_MAX_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_AND_B32(MachInst iFmt) - { - return new Inst_DS__DS_AND_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_AND_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_OR_B32(MachInst iFmt) - { - return new Inst_DS__DS_OR_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_OR_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_XOR_B32(MachInst iFmt) - { - return new Inst_DS__DS_XOR_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_XOR_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MSKOR_B32(MachInst iFmt) - { - return new Inst_DS__DS_MSKOR_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MSKOR_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE_B32(MachInst iFmt) - { - return new Inst_DS__DS_WRITE_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE2_B32(MachInst iFmt) - { - return new Inst_DS__DS_WRITE2_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE2_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE2ST64_B32(MachInst iFmt) - { - return new Inst_DS__DS_WRITE2ST64_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE2ST64_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CMPST_B32(MachInst iFmt) - { - return new 
Inst_DS__DS_CMPST_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CMPST_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CMPST_F32(MachInst iFmt) - { - return new Inst_DS__DS_CMPST_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CMPST_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_F32(MachInst iFmt) - { - return new Inst_DS__DS_MIN_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_F32(MachInst iFmt) - { - return new Inst_DS__DS_MAX_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_NOP(MachInst iFmt) - { - return new Inst_DS__DS_NOP(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_NOP - - GPUStaticInst* - Decoder::decode_OP_DS__DS_ADD_F32(MachInst iFmt) - { - return new Inst_DS__DS_ADD_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ADD_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE_B8(MachInst iFmt) - { - return new Inst_DS__DS_WRITE_B8(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE_B8 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE_B16(MachInst iFmt) - { - return new Inst_DS__DS_WRITE_B16(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE_B16 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_ADD_RTN_U32(MachInst iFmt) - { - return new Inst_DS__DS_ADD_RTN_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ADD_RTN_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_SUB_RTN_U32(MachInst iFmt) - { - return new Inst_DS__DS_SUB_RTN_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_SUB_RTN_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_RSUB_RTN_U32(MachInst iFmt) - { - return new Inst_DS__DS_RSUB_RTN_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_RSUB_RTN_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_INC_RTN_U32(MachInst iFmt) - { - return new Inst_DS__DS_INC_RTN_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_INC_RTN_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_DEC_RTN_U32(MachInst iFmt) - { - return new Inst_DS__DS_DEC_RTN_U32(&iFmt->iFmt_DS); - } // 
decode_OP_DS__DS_DEC_RTN_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_RTN_I32(MachInst iFmt) - { - return new Inst_DS__DS_MIN_RTN_I32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_RTN_I32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_RTN_I32(MachInst iFmt) - { - return new Inst_DS__DS_MAX_RTN_I32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_RTN_I32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_RTN_U32(MachInst iFmt) - { - return new Inst_DS__DS_MIN_RTN_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_RTN_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_RTN_U32(MachInst iFmt) - { - return new Inst_DS__DS_MAX_RTN_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_RTN_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_AND_RTN_B32(MachInst iFmt) - { - return new Inst_DS__DS_AND_RTN_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_AND_RTN_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_OR_RTN_B32(MachInst iFmt) - { - return new Inst_DS__DS_OR_RTN_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_OR_RTN_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_XOR_RTN_B32(MachInst iFmt) - { - return new Inst_DS__DS_XOR_RTN_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_XOR_RTN_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MSKOR_RTN_B32(MachInst iFmt) - { - return new Inst_DS__DS_MSKOR_RTN_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MSKOR_RTN_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRXCHG_RTN_B32(MachInst iFmt) - { - return new Inst_DS__DS_WRXCHG_RTN_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRXCHG_RTN_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRXCHG2_RTN_B32(MachInst iFmt) - { - return new Inst_DS__DS_WRXCHG2_RTN_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRXCHG2_RTN_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRXCHG2ST64_RTN_B32(MachInst iFmt) - { - return new Inst_DS__DS_WRXCHG2ST64_RTN_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRXCHG2ST64_RTN_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CMPST_RTN_B32(MachInst 
iFmt) - { - return new Inst_DS__DS_CMPST_RTN_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CMPST_RTN_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CMPST_RTN_F32(MachInst iFmt) - { - return new Inst_DS__DS_CMPST_RTN_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CMPST_RTN_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_RTN_F32(MachInst iFmt) - { - return new Inst_DS__DS_MIN_RTN_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_RTN_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_RTN_F32(MachInst iFmt) - { - return new Inst_DS__DS_MAX_RTN_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_RTN_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRAP_RTN_B32(MachInst iFmt) - { - return new Inst_DS__DS_WRAP_RTN_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRAP_RTN_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_ADD_RTN_F32(MachInst iFmt) - { - return new Inst_DS__DS_ADD_RTN_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ADD_RTN_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ_B32(MachInst iFmt) - { - return new Inst_DS__DS_READ_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ2_B32(MachInst iFmt) - { - return new Inst_DS__DS_READ2_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ2_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ2ST64_B32(MachInst iFmt) - { - return new Inst_DS__DS_READ2ST64_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ2ST64_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ_I8(MachInst iFmt) - { - return new Inst_DS__DS_READ_I8(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ_I8 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ_U8(MachInst iFmt) - { - return new Inst_DS__DS_READ_U8(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ_U8 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ_I16(MachInst iFmt) - { - return new Inst_DS__DS_READ_I16(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ_I16 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ_U16(MachInst iFmt) - { - 
return new Inst_DS__DS_READ_U16(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ_U16 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_SWIZZLE_B32(MachInst iFmt) - { - return new Inst_DS__DS_SWIZZLE_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_SWIZZLE_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_PERMUTE_B32(MachInst iFmt) - { - return new Inst_DS__DS_PERMUTE_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_PERMUTE_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_BPERMUTE_B32(MachInst iFmt) - { - return new Inst_DS__DS_BPERMUTE_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_BPERMUTE_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_ADD_U64(MachInst iFmt) - { - return new Inst_DS__DS_ADD_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ADD_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_SUB_U64(MachInst iFmt) - { - return new Inst_DS__DS_SUB_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_SUB_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_RSUB_U64(MachInst iFmt) - { - return new Inst_DS__DS_RSUB_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_RSUB_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_INC_U64(MachInst iFmt) - { - return new Inst_DS__DS_INC_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_INC_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_DEC_U64(MachInst iFmt) - { - return new Inst_DS__DS_DEC_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_DEC_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_I64(MachInst iFmt) - { - return new Inst_DS__DS_MIN_I64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_I64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_I64(MachInst iFmt) - { - return new Inst_DS__DS_MAX_I64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_I64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_U64(MachInst iFmt) - { - return new Inst_DS__DS_MIN_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_U64(MachInst iFmt) - { - return new Inst_DS__DS_MAX_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_U64 - - 
GPUStaticInst* - Decoder::decode_OP_DS__DS_AND_B64(MachInst iFmt) - { - return new Inst_DS__DS_AND_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_AND_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_OR_B64(MachInst iFmt) - { - return new Inst_DS__DS_OR_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_OR_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_XOR_B64(MachInst iFmt) - { - return new Inst_DS__DS_XOR_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_XOR_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MSKOR_B64(MachInst iFmt) - { - return new Inst_DS__DS_MSKOR_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MSKOR_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE_B64(MachInst iFmt) - { - return new Inst_DS__DS_WRITE_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE2_B64(MachInst iFmt) - { - return new Inst_DS__DS_WRITE2_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE2_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE2ST64_B64(MachInst iFmt) - { - return new Inst_DS__DS_WRITE2ST64_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE2ST64_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CMPST_B64(MachInst iFmt) - { - return new Inst_DS__DS_CMPST_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CMPST_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CMPST_F64(MachInst iFmt) - { - return new Inst_DS__DS_CMPST_F64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CMPST_F64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_F64(MachInst iFmt) - { - return new Inst_DS__DS_MIN_F64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_F64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_F64(MachInst iFmt) - { - return new Inst_DS__DS_MAX_F64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_F64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_ADD_RTN_U64(MachInst iFmt) - { - return new Inst_DS__DS_ADD_RTN_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ADD_RTN_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_SUB_RTN_U64(MachInst 
iFmt) - { - return new Inst_DS__DS_SUB_RTN_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_SUB_RTN_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_RSUB_RTN_U64(MachInst iFmt) - { - return new Inst_DS__DS_RSUB_RTN_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_RSUB_RTN_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_INC_RTN_U64(MachInst iFmt) - { - return new Inst_DS__DS_INC_RTN_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_INC_RTN_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_DEC_RTN_U64(MachInst iFmt) - { - return new Inst_DS__DS_DEC_RTN_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_DEC_RTN_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_RTN_I64(MachInst iFmt) - { - return new Inst_DS__DS_MIN_RTN_I64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_RTN_I64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_RTN_I64(MachInst iFmt) - { - return new Inst_DS__DS_MAX_RTN_I64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_RTN_I64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_RTN_U64(MachInst iFmt) - { - return new Inst_DS__DS_MIN_RTN_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_RTN_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_RTN_U64(MachInst iFmt) - { - return new Inst_DS__DS_MAX_RTN_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_RTN_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_AND_RTN_B64(MachInst iFmt) - { - return new Inst_DS__DS_AND_RTN_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_AND_RTN_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_OR_RTN_B64(MachInst iFmt) - { - return new Inst_DS__DS_OR_RTN_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_OR_RTN_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_XOR_RTN_B64(MachInst iFmt) - { - return new Inst_DS__DS_XOR_RTN_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_XOR_RTN_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MSKOR_RTN_B64(MachInst iFmt) - { - return new Inst_DS__DS_MSKOR_RTN_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MSKOR_RTN_B64 - - GPUStaticInst* - 
Decoder::decode_OP_DS__DS_WRXCHG_RTN_B64(MachInst iFmt) - { - return new Inst_DS__DS_WRXCHG_RTN_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRXCHG_RTN_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRXCHG2_RTN_B64(MachInst iFmt) - { - return new Inst_DS__DS_WRXCHG2_RTN_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRXCHG2_RTN_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRXCHG2ST64_RTN_B64(MachInst iFmt) - { - return new Inst_DS__DS_WRXCHG2ST64_RTN_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRXCHG2ST64_RTN_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CMPST_RTN_B64(MachInst iFmt) - { - return new Inst_DS__DS_CMPST_RTN_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CMPST_RTN_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CMPST_RTN_F64(MachInst iFmt) - { - return new Inst_DS__DS_CMPST_RTN_F64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CMPST_RTN_F64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_RTN_F64(MachInst iFmt) - { - return new Inst_DS__DS_MIN_RTN_F64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_RTN_F64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_RTN_F64(MachInst iFmt) - { - return new Inst_DS__DS_MAX_RTN_F64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_RTN_F64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ_B64(MachInst iFmt) - { - return new Inst_DS__DS_READ_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ2_B64(MachInst iFmt) - { - return new Inst_DS__DS_READ2_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ2_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ2ST64_B64(MachInst iFmt) - { - return new Inst_DS__DS_READ2ST64_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ2ST64_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CONDXCHG32_RTN_B64(MachInst iFmt) - { - return new Inst_DS__DS_CONDXCHG32_RTN_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CONDXCHG32_RTN_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_ADD_SRC2_U32(MachInst iFmt) - { - return new 
Inst_DS__DS_ADD_SRC2_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ADD_SRC2_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_SUB_SRC2_U32(MachInst iFmt) - { - return new Inst_DS__DS_SUB_SRC2_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_SUB_SRC2_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_RSUB_SRC2_U32(MachInst iFmt) - { - return new Inst_DS__DS_RSUB_SRC2_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_RSUB_SRC2_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_INC_SRC2_U32(MachInst iFmt) - { - return new Inst_DS__DS_INC_SRC2_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_INC_SRC2_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_DEC_SRC2_U32(MachInst iFmt) - { - return new Inst_DS__DS_DEC_SRC2_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_DEC_SRC2_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_SRC2_I32(MachInst iFmt) - { - return new Inst_DS__DS_MIN_SRC2_I32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_SRC2_I32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_SRC2_I32(MachInst iFmt) - { - return new Inst_DS__DS_MAX_SRC2_I32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_SRC2_I32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_SRC2_U32(MachInst iFmt) - { - return new Inst_DS__DS_MIN_SRC2_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_SRC2_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_SRC2_U32(MachInst iFmt) - { - return new Inst_DS__DS_MAX_SRC2_U32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_SRC2_U32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_AND_SRC2_B32(MachInst iFmt) - { - return new Inst_DS__DS_AND_SRC2_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_AND_SRC2_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_OR_SRC2_B32(MachInst iFmt) - { - return new Inst_DS__DS_OR_SRC2_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_OR_SRC2_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_XOR_SRC2_B32(MachInst iFmt) - { - return new Inst_DS__DS_XOR_SRC2_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_XOR_SRC2_B32 - - GPUStaticInst* - 
Decoder::decode_OP_DS__DS_WRITE_SRC2_B32(MachInst iFmt) - { - return new Inst_DS__DS_WRITE_SRC2_B32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE_SRC2_B32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_SRC2_F32(MachInst iFmt) - { - return new Inst_DS__DS_MIN_SRC2_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_SRC2_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_SRC2_F32(MachInst iFmt) - { - return new Inst_DS__DS_MAX_SRC2_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_SRC2_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_ADD_SRC2_F32(MachInst iFmt) - { - return new Inst_DS__DS_ADD_SRC2_F32(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ADD_SRC2_F32 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_GWS_SEMA_RELEASE_ALL(MachInst iFmt) - { - return new Inst_DS__DS_GWS_SEMA_RELEASE_ALL(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_GWS_SEMA_RELEASE_ALL - - GPUStaticInst* - Decoder::decode_OP_DS__DS_GWS_INIT(MachInst iFmt) - { - return new Inst_DS__DS_GWS_INIT(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_GWS_INIT - - GPUStaticInst* - Decoder::decode_OP_DS__DS_GWS_SEMA_V(MachInst iFmt) - { - return new Inst_DS__DS_GWS_SEMA_V(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_GWS_SEMA_V - - GPUStaticInst* - Decoder::decode_OP_DS__DS_GWS_SEMA_BR(MachInst iFmt) - { - return new Inst_DS__DS_GWS_SEMA_BR(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_GWS_SEMA_BR - - GPUStaticInst* - Decoder::decode_OP_DS__DS_GWS_SEMA_P(MachInst iFmt) - { - return new Inst_DS__DS_GWS_SEMA_P(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_GWS_SEMA_P - - GPUStaticInst* - Decoder::decode_OP_DS__DS_GWS_BARRIER(MachInst iFmt) - { - return new Inst_DS__DS_GWS_BARRIER(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_GWS_BARRIER - - GPUStaticInst* - Decoder::decode_OP_DS__DS_CONSUME(MachInst iFmt) - { - return new Inst_DS__DS_CONSUME(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_CONSUME - - GPUStaticInst* - Decoder::decode_OP_DS__DS_APPEND(MachInst iFmt) - { - return new Inst_DS__DS_APPEND(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_APPEND - 
- GPUStaticInst* - Decoder::decode_OP_DS__DS_ORDERED_COUNT(MachInst iFmt) - { - return new Inst_DS__DS_ORDERED_COUNT(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ORDERED_COUNT - - GPUStaticInst* - Decoder::decode_OP_DS__DS_ADD_SRC2_U64(MachInst iFmt) - { - return new Inst_DS__DS_ADD_SRC2_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_ADD_SRC2_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_SUB_SRC2_U64(MachInst iFmt) - { - return new Inst_DS__DS_SUB_SRC2_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_SUB_SRC2_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_RSUB_SRC2_U64(MachInst iFmt) - { - return new Inst_DS__DS_RSUB_SRC2_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_RSUB_SRC2_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_INC_SRC2_U64(MachInst iFmt) - { - return new Inst_DS__DS_INC_SRC2_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_INC_SRC2_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_DEC_SRC2_U64(MachInst iFmt) - { - return new Inst_DS__DS_DEC_SRC2_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_DEC_SRC2_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_SRC2_I64(MachInst iFmt) - { - return new Inst_DS__DS_MIN_SRC2_I64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_SRC2_I64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_SRC2_I64(MachInst iFmt) - { - return new Inst_DS__DS_MAX_SRC2_I64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_SRC2_I64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_SRC2_U64(MachInst iFmt) - { - return new Inst_DS__DS_MIN_SRC2_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_SRC2_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_SRC2_U64(MachInst iFmt) - { - return new Inst_DS__DS_MAX_SRC2_U64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_SRC2_U64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_AND_SRC2_B64(MachInst iFmt) - { - return new Inst_DS__DS_AND_SRC2_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_AND_SRC2_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_OR_SRC2_B64(MachInst iFmt) - { - return new 
Inst_DS__DS_OR_SRC2_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_OR_SRC2_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_XOR_SRC2_B64(MachInst iFmt) - { - return new Inst_DS__DS_XOR_SRC2_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_XOR_SRC2_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE_SRC2_B64(MachInst iFmt) - { - return new Inst_DS__DS_WRITE_SRC2_B64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE_SRC2_B64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MIN_SRC2_F64(MachInst iFmt) - { - return new Inst_DS__DS_MIN_SRC2_F64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MIN_SRC2_F64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_MAX_SRC2_F64(MachInst iFmt) - { - return new Inst_DS__DS_MAX_SRC2_F64(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_MAX_SRC2_F64 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE_B96(MachInst iFmt) - { - return new Inst_DS__DS_WRITE_B96(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE_B96 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_WRITE_B128(MachInst iFmt) - { - return new Inst_DS__DS_WRITE_B128(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_WRITE_B128 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ_B96(MachInst iFmt) - { - return new Inst_DS__DS_READ_B96(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ_B96 - - GPUStaticInst* - Decoder::decode_OP_DS__DS_READ_B128(MachInst iFmt) - { - return new Inst_DS__DS_READ_B128(&iFmt->iFmt_DS); - } // decode_OP_DS__DS_READ_B128 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_LOAD_UBYTE(MachInst iFmt) - { - return new Inst_FLAT__FLAT_LOAD_UBYTE(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_LOAD_UBYTE - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_LOAD_SBYTE(MachInst iFmt) - { - return new Inst_FLAT__FLAT_LOAD_SBYTE(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_LOAD_SBYTE - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_LOAD_USHORT(MachInst iFmt) - { - return new Inst_FLAT__FLAT_LOAD_USHORT(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_LOAD_USHORT - - GPUStaticInst* - 
Decoder::decode_OP_FLAT__FLAT_LOAD_SSHORT(MachInst iFmt) - { - return new Inst_FLAT__FLAT_LOAD_SSHORT(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_LOAD_SSHORT - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_LOAD_DWORD(MachInst iFmt) - { - return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_LOAD_DWORD - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_LOAD_DWORDX2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_LOAD_DWORDX2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_LOAD_DWORDX3(MachInst iFmt) - { - return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_LOAD_DWORDX3 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_LOAD_DWORDX4(MachInst iFmt) - { - return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_LOAD_DWORDX4 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_STORE_BYTE(MachInst iFmt) - { - return new Inst_FLAT__FLAT_STORE_BYTE(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_STORE_BYTE - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_STORE_SHORT(MachInst iFmt) - { - return new Inst_FLAT__FLAT_STORE_SHORT(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_STORE_SHORT - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_STORE_DWORD(MachInst iFmt) - { - return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_STORE_DWORD - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_STORE_DWORDX2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_STORE_DWORDX2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_STORE_DWORDX3(MachInst iFmt) - { - return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_STORE_DWORDX3 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_STORE_DWORDX4(MachInst iFmt) - { - return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT); - } // 
decode_OP_FLAT__FLAT_STORE_DWORDX4 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_SWAP(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_SWAP(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_SWAP - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_CMPSWAP(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_ADD(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_ADD(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_ADD - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_SUB(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_SUB(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_SUB - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_SMIN(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_SMIN(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_SMIN - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_UMIN(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_UMIN(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_UMIN - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_SMAX(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_SMAX(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_SMAX - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_UMAX(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_UMAX(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_UMAX - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_AND(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_AND(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_AND - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_OR(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_OR(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_OR - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_XOR(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_XOR(&iFmt->iFmt_FLAT); - } // 
decode_OP_FLAT__FLAT_ATOMIC_XOR - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_INC(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_INC(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_INC - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_DEC(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_DEC(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_DEC - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_SWAP_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_ADD_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_ADD_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_ADD_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_SUB_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_SUB_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_SUB_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_SMIN_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_SMIN_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_SMIN_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_UMIN_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_UMIN_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_UMIN_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_SMAX_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_SMAX_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_SMAX_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_UMAX_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_UMAX_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_UMAX_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_AND_X2(MachInst 
iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_AND_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_AND_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_OR_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_OR_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_OR_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_XOR_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_XOR_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_XOR_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_INC_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_INC_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_INC_X2 - - GPUStaticInst* - Decoder::decode_OP_FLAT__FLAT_ATOMIC_DEC_X2(MachInst iFmt) - { - return new Inst_FLAT__FLAT_ATOMIC_DEC_X2(&iFmt->iFmt_FLAT); - } // decode_OP_FLAT__FLAT_ATOMIC_DEC_X2 - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_LOAD(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_LOAD(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_LOAD - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_LOAD_MIP(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_LOAD_MIP(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_LOAD_MIP - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_LOAD_PCK(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_LOAD_PCK(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_LOAD_PCK - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_LOAD_PCK_SGN(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_LOAD_PCK_SGN(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_LOAD_PCK_SGN - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_LOAD_MIP_PCK(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_LOAD_MIP_PCK(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_LOAD_MIP_PCK - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_LOAD_MIP_PCK_SGN(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_LOAD_MIP_PCK_SGN - - GPUStaticInst* - 
Decoder::decode_OP_MIMG__IMAGE_STORE(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_STORE(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_STORE - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_STORE_MIP(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_STORE_MIP(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_STORE_MIP - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_STORE_PCK(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_STORE_PCK(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_STORE_PCK - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_STORE_MIP_PCK(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_STORE_MIP_PCK(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_STORE_MIP_PCK - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GET_RESINFO(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GET_RESINFO(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GET_RESINFO - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_SWAP(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_SWAP(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_SWAP - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_CMPSWAP(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_CMPSWAP(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_CMPSWAP - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_ADD(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_ADD(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_ADD - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_SUB(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_SUB(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_SUB - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_SMIN(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_SMIN(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_SMIN - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_UMIN(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_UMIN(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_UMIN - - 
GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_SMAX(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_SMAX(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_SMAX - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_UMAX(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_UMAX(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_UMAX - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_AND(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_AND(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_AND - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_OR(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_OR(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_OR - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_XOR(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_XOR(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_XOR - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_INC(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_INC(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_INC - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_ATOMIC_DEC(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_ATOMIC_DEC(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_ATOMIC_DEC - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_D(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_D(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_D - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_D_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_D_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_D_CL - - 
GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_L(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_L(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_L - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_B(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_B(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_B - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_B_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_B_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_B_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_LZ(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_LZ(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_LZ - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_D(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_D(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_D - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_D_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_D_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_D_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_L(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_L(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_L - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_B(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_B(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_B - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_B_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_B_CL(&iFmt->iFmt_MIMG); - } // 
decode_OP_MIMG__IMAGE_SAMPLE_C_B_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_LZ(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_LZ(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_LZ - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_D_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_D_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_D_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_D_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_D_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_D_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_L_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_L_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_L_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_B_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_B_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_B_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_B_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_B_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_B_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_LZ_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_LZ_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_LZ_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CL_O(MachInst iFmt) - { - return new 
Inst_MIMG__IMAGE_SAMPLE_C_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_D_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_D_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_D_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_D_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_D_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_L_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_L_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_L_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_B_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_B_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_B_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_B_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_B_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_LZ_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_LZ_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_LZ_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4 - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_L(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_L(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_L - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_B(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_B(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_B - - GPUStaticInst* - 
Decoder::decode_OP_MIMG__IMAGE_GATHER4_B_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_B_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_B_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_LZ(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_LZ(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_LZ - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_L(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_L(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_L - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_B(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_B(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_B - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_B_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_B_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_B_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_LZ(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_LZ(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_LZ - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_L_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_L_O(&iFmt->iFmt_MIMG); - } // 
decode_OP_MIMG__IMAGE_GATHER4_L_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_B_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_B_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_B_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_B_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_B_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_B_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_LZ_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_LZ_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_LZ_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_L_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_L_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_L_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_B_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_B_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_B_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_B_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_B_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_B_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GATHER4_C_LZ_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GATHER4_C_LZ_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GATHER4_C_LZ_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_GET_LOD(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_GET_LOD(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_GET_LOD - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CD(MachInst 
iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_CD(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_CD - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CD_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_CD_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_CD_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CD(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_CD(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_CD - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CD_CL(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_CD_CL(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_CD_CL - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CD_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_CD_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_CD_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_CD_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_CD_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_CD_CL_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CD_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_CD_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_CD_O - - GPUStaticInst* - Decoder::decode_OP_MIMG__IMAGE_SAMPLE_C_CD_CL_O(MachInst iFmt) - { - return new Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O(&iFmt->iFmt_MIMG); - } // decode_OP_MIMG__IMAGE_SAMPLE_C_CD_CL_O - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_X(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_X - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XY(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XY - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(MachInst iFmt) - { - return new 
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_X(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_STORE_FORMAT_X(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_STORE_FORMAT_X - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XY(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XY - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XYZ(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XYZW(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(&iFmt->iFmt_MTBUF); - } // 
decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_X(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(MachInst iFmt) - { - return new Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - - GPUStaticInst* - Decoder::decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(MachInst iFmt) - { - return new - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(&iFmt->iFmt_MTBUF); - } // decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_X(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_FORMAT_X(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_FORMAT_X - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XY(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XY - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XYZ(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XYZ - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XYZW(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XYZW - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_X(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_FORMAT_X(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_FORMAT_X - - GPUStaticInst* - 
Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_XY(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_FORMAT_XY(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_FORMAT_XY - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_XYZ(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_FORMAT_XYZ - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_XYZW(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_FORMAT_XYZW - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_X(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_X - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_X(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_X - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XY(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XY - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(MachInst iFmt) - { - return new 
Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_UBYTE(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_UBYTE(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_UBYTE - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_SBYTE(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_SBYTE(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_SBYTE - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_USHORT(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_USHORT(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_USHORT - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_SSHORT(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_SSHORT(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_SSHORT - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_DWORD(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_DWORD(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_DWORD - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_DWORDX2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_DWORDX2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_DWORDX2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_DWORDX3(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_DWORDX3(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_DWORDX3 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_LOAD_DWORDX4(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_LOAD_DWORDX4(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_LOAD_DWORDX4 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_BYTE(MachInst iFmt) - { - return new 
Inst_MUBUF__BUFFER_STORE_BYTE(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_BYTE - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_SHORT(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_SHORT(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_SHORT - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_DWORD(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_DWORD(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_DWORD - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_DWORDX2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_DWORDX2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_DWORDX2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_DWORDX3(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_DWORDX3(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_DWORDX3 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_DWORDX4(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_DWORDX4(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_DWORDX4 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_STORE_LDS_DWORD(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_STORE_LDS_DWORD(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_STORE_LDS_DWORD - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_WBINVL1(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_WBINVL1(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_WBINVL1 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_WBINVL1_VOL(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_WBINVL1_VOL(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_WBINVL1_VOL - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SWAP(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_SWAP(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_SWAP - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_CMPSWAP(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(&iFmt->iFmt_MUBUF); - } // 
decode_OP_MUBUF__BUFFER_ATOMIC_CMPSWAP - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_ADD(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_ADD(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_ADD - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SUB(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_SUB(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_SUB - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SMIN(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_SMIN(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_SMIN - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_UMIN(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_UMIN(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_UMIN - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SMAX(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_SMAX(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_SMAX - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_UMAX(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_UMAX(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_UMAX - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_AND(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_AND(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_AND - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_OR(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_OR(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_OR - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_XOR(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_XOR(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_XOR - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_INC(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_INC(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_INC - - GPUStaticInst* - 
Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_DEC(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_DEC(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_DEC - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SWAP_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_ADD_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_ADD_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SUB_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_SUB_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SMIN_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_UMIN_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_SMAX_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_UMAX_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_AND_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_AND_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_AND_X2 - - GPUStaticInst* - 
Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_OR_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_OR_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_OR_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_XOR_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_XOR_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_INC_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_INC_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_INC_X2 - - GPUStaticInst* - Decoder::decode_OP_MUBUF__BUFFER_ATOMIC_DEC_X2(MachInst iFmt) - { - return new Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(&iFmt->iFmt_MUBUF); - } // decode_OP_MUBUF__BUFFER_ATOMIC_DEC_X2 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_LOAD_DWORD(MachInst iFmt) - { - return new Inst_SMEM__S_LOAD_DWORD(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_LOAD_DWORD - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_LOAD_DWORDX2(MachInst iFmt) - { - return new Inst_SMEM__S_LOAD_DWORDX2(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_LOAD_DWORDX2 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_LOAD_DWORDX4(MachInst iFmt) - { - return new Inst_SMEM__S_LOAD_DWORDX4(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_LOAD_DWORDX4 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_LOAD_DWORDX8(MachInst iFmt) - { - return new Inst_SMEM__S_LOAD_DWORDX8(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_LOAD_DWORDX8 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_LOAD_DWORDX16(MachInst iFmt) - { - return new Inst_SMEM__S_LOAD_DWORDX16(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_LOAD_DWORDX16 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORD(MachInst iFmt) - { - return new Inst_SMEM__S_BUFFER_LOAD_DWORD(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_BUFFER_LOAD_DWORD - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORDX2(MachInst iFmt) - { - return new Inst_SMEM__S_BUFFER_LOAD_DWORDX2(&iFmt->iFmt_SMEM); - } // 
decode_OP_SMEM__S_BUFFER_LOAD_DWORDX2 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORDX4(MachInst iFmt) - { - return new Inst_SMEM__S_BUFFER_LOAD_DWORDX4(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_BUFFER_LOAD_DWORDX4 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORDX8(MachInst iFmt) - { - return new Inst_SMEM__S_BUFFER_LOAD_DWORDX8(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_BUFFER_LOAD_DWORDX8 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_BUFFER_LOAD_DWORDX16(MachInst iFmt) - { - return new Inst_SMEM__S_BUFFER_LOAD_DWORDX16(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_BUFFER_LOAD_DWORDX16 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_STORE_DWORD(MachInst iFmt) - { - return new Inst_SMEM__S_STORE_DWORD(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_STORE_DWORD - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_STORE_DWORDX2(MachInst iFmt) - { - return new Inst_SMEM__S_STORE_DWORDX2(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_STORE_DWORDX2 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_STORE_DWORDX4(MachInst iFmt) - { - return new Inst_SMEM__S_STORE_DWORDX4(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_STORE_DWORDX4 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_BUFFER_STORE_DWORD(MachInst iFmt) - { - return new Inst_SMEM__S_BUFFER_STORE_DWORD(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_BUFFER_STORE_DWORD - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_BUFFER_STORE_DWORDX2(MachInst iFmt) - { - return new Inst_SMEM__S_BUFFER_STORE_DWORDX2(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_BUFFER_STORE_DWORDX2 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_BUFFER_STORE_DWORDX4(MachInst iFmt) - { - return new Inst_SMEM__S_BUFFER_STORE_DWORDX4(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_BUFFER_STORE_DWORDX4 - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_DCACHE_INV(MachInst iFmt) - { - return new Inst_SMEM__S_DCACHE_INV(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_DCACHE_INV - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_DCACHE_WB(MachInst iFmt) 
- { - return new Inst_SMEM__S_DCACHE_WB(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_DCACHE_WB - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_DCACHE_INV_VOL(MachInst iFmt) - { - return new Inst_SMEM__S_DCACHE_INV_VOL(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_DCACHE_INV_VOL - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_DCACHE_WB_VOL(MachInst iFmt) - { - return new Inst_SMEM__S_DCACHE_WB_VOL(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_DCACHE_WB_VOL - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_MEMTIME(MachInst iFmt) - { - return new Inst_SMEM__S_MEMTIME(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_MEMTIME - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_MEMREALTIME(MachInst iFmt) - { - return new Inst_SMEM__S_MEMREALTIME(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_MEMREALTIME - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_ATC_PROBE(MachInst iFmt) - { - return new Inst_SMEM__S_ATC_PROBE(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_ATC_PROBE - - GPUStaticInst* - Decoder::decode_OP_SMEM__S_ATC_PROBE_BUFFER(MachInst iFmt) - { - return new Inst_SMEM__S_ATC_PROBE_BUFFER(&iFmt->iFmt_SMEM); - } // decode_OP_SMEM__S_ATC_PROBE_BUFFER - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_MOV_B32(MachInst iFmt) - { - return new Inst_SOP1__S_MOV_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_MOV_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_MOV_B64(MachInst iFmt) - { - return new Inst_SOP1__S_MOV_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_MOV_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_CMOV_B32(MachInst iFmt) - { - return new Inst_SOP1__S_CMOV_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_CMOV_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_CMOV_B64(MachInst iFmt) - { - return new Inst_SOP1__S_CMOV_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_CMOV_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_NOT_B32(MachInst iFmt) - { - return new Inst_SOP1__S_NOT_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_NOT_B32 - - GPUStaticInst* - 
Decoder::decode_OP_SOP1__S_NOT_B64(MachInst iFmt) - { - return new Inst_SOP1__S_NOT_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_NOT_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_WQM_B32(MachInst iFmt) - { - return new Inst_SOP1__S_WQM_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_WQM_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_WQM_B64(MachInst iFmt) - { - return new Inst_SOP1__S_WQM_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_WQM_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BREV_B32(MachInst iFmt) - { - return new Inst_SOP1__S_BREV_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BREV_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BREV_B64(MachInst iFmt) - { - return new Inst_SOP1__S_BREV_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BREV_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BCNT0_I32_B32(MachInst iFmt) - { - return new Inst_SOP1__S_BCNT0_I32_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BCNT0_I32_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BCNT0_I32_B64(MachInst iFmt) - { - return new Inst_SOP1__S_BCNT0_I32_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BCNT0_I32_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BCNT1_I32_B32(MachInst iFmt) - { - return new Inst_SOP1__S_BCNT1_I32_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BCNT1_I32_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BCNT1_I32_B64(MachInst iFmt) - { - return new Inst_SOP1__S_BCNT1_I32_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BCNT1_I32_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_FF0_I32_B32(MachInst iFmt) - { - return new Inst_SOP1__S_FF0_I32_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_FF0_I32_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_FF0_I32_B64(MachInst iFmt) - { - return new Inst_SOP1__S_FF0_I32_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_FF0_I32_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_FF1_I32_B32(MachInst iFmt) - { - return new Inst_SOP1__S_FF1_I32_B32(&iFmt->iFmt_SOP1); - } // 
decode_OP_SOP1__S_FF1_I32_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_FF1_I32_B64(MachInst iFmt) - { - return new Inst_SOP1__S_FF1_I32_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_FF1_I32_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_FLBIT_I32_B32(MachInst iFmt) - { - return new Inst_SOP1__S_FLBIT_I32_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_FLBIT_I32_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_FLBIT_I32_B64(MachInst iFmt) - { - return new Inst_SOP1__S_FLBIT_I32_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_FLBIT_I32_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_FLBIT_I32(MachInst iFmt) - { - return new Inst_SOP1__S_FLBIT_I32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_FLBIT_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_FLBIT_I32_I64(MachInst iFmt) - { - return new Inst_SOP1__S_FLBIT_I32_I64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_FLBIT_I32_I64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_SEXT_I32_I8(MachInst iFmt) - { - return new Inst_SOP1__S_SEXT_I32_I8(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_SEXT_I32_I8 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_SEXT_I32_I16(MachInst iFmt) - { - return new Inst_SOP1__S_SEXT_I32_I16(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_SEXT_I32_I16 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BITSET0_B32(MachInst iFmt) - { - return new Inst_SOP1__S_BITSET0_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BITSET0_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BITSET0_B64(MachInst iFmt) - { - return new Inst_SOP1__S_BITSET0_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BITSET0_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BITSET1_B32(MachInst iFmt) - { - return new Inst_SOP1__S_BITSET1_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BITSET1_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_BITSET1_B64(MachInst iFmt) - { - return new Inst_SOP1__S_BITSET1_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_BITSET1_B64 - - GPUStaticInst* - 
Decoder::decode_OP_SOP1__S_GETPC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_GETPC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_GETPC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_SETPC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_SETPC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_SETPC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_SWAPPC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_SWAPPC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_SWAPPC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_RFE_B64(MachInst iFmt) - { - return new Inst_SOP1__S_RFE_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_RFE_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_AND_SAVEEXEC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_AND_SAVEEXEC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_AND_SAVEEXEC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_OR_SAVEEXEC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_OR_SAVEEXEC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_OR_SAVEEXEC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_XOR_SAVEEXEC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_XOR_SAVEEXEC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_XOR_SAVEEXEC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_ANDN2_SAVEEXEC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_ANDN2_SAVEEXEC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_ANDN2_SAVEEXEC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_ORN2_SAVEEXEC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_ORN2_SAVEEXEC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_ORN2_SAVEEXEC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_NAND_SAVEEXEC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_NAND_SAVEEXEC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_NAND_SAVEEXEC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_NOR_SAVEEXEC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_NOR_SAVEEXEC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_NOR_SAVEEXEC_B64 - - GPUStaticInst* - 
Decoder::decode_OP_SOP1__S_XNOR_SAVEEXEC_B64(MachInst iFmt) - { - return new Inst_SOP1__S_XNOR_SAVEEXEC_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_XNOR_SAVEEXEC_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_QUADMASK_B32(MachInst iFmt) - { - return new Inst_SOP1__S_QUADMASK_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_QUADMASK_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_QUADMASK_B64(MachInst iFmt) - { - return new Inst_SOP1__S_QUADMASK_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_QUADMASK_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_MOVRELS_B32(MachInst iFmt) - { - return new Inst_SOP1__S_MOVRELS_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_MOVRELS_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_MOVRELS_B64(MachInst iFmt) - { - return new Inst_SOP1__S_MOVRELS_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_MOVRELS_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_MOVRELD_B32(MachInst iFmt) - { - return new Inst_SOP1__S_MOVRELD_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_MOVRELD_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_MOVRELD_B64(MachInst iFmt) - { - return new Inst_SOP1__S_MOVRELD_B64(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_MOVRELD_B64 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_CBRANCH_JOIN(MachInst iFmt) - { - return new Inst_SOP1__S_CBRANCH_JOIN(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_CBRANCH_JOIN - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_ABS_I32(MachInst iFmt) - { - return new Inst_SOP1__S_ABS_I32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_ABS_I32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_MOV_FED_B32(MachInst iFmt) - { - return new Inst_SOP1__S_MOV_FED_B32(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_MOV_FED_B32 - - GPUStaticInst* - Decoder::decode_OP_SOP1__S_SET_GPR_IDX_IDX(MachInst iFmt) - { - return new Inst_SOP1__S_SET_GPR_IDX_IDX(&iFmt->iFmt_SOP1); - } // decode_OP_SOP1__S_SET_GPR_IDX_IDX - - GPUStaticInst* - Decoder::decode_OP_SOPC__S_CMP_EQ_I32(MachInst iFmt) - { - return new 
            Inst_SOPC__S_CMP_EQ_I32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_EQ_I32

    // Scalar-compare (SOPC) decode stubs: one trivial factory per opcode,
    // each returning a newly allocated instruction object built from the
    // iFmt_SOPC view of the fetched instruction.
    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_LG_I32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_LG_I32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_LG_I32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_GT_I32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_GT_I32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_GT_I32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_GE_I32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_GE_I32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_GE_I32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_LT_I32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_LT_I32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_LT_I32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_LE_I32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_LE_I32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_LE_I32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_EQ_U32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_EQ_U32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_EQ_U32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_LG_U32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_LG_U32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_LG_U32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_GT_U32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_GT_U32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_GT_U32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_GE_U32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_GE_U32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_GE_U32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_LT_U32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_LT_U32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_LT_U32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_LE_U32(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_LE_U32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_LE_U32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_BITCMP0_B32(MachInst iFmt)
    {
        return new Inst_SOPC__S_BITCMP0_B32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_BITCMP0_B32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_BITCMP1_B32(MachInst iFmt)
    {
        return new Inst_SOPC__S_BITCMP1_B32(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_BITCMP1_B32

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_BITCMP0_B64(MachInst iFmt)
    {
        return new Inst_SOPC__S_BITCMP0_B64(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_BITCMP0_B64

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_BITCMP1_B64(MachInst iFmt)
    {
        return new Inst_SOPC__S_BITCMP1_B64(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_BITCMP1_B64

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_SETVSKIP(MachInst iFmt)
    {
        return new Inst_SOPC__S_SETVSKIP(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_SETVSKIP

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_SET_GPR_IDX_ON(MachInst iFmt)
    {
        return new Inst_SOPC__S_SET_GPR_IDX_ON(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_SET_GPR_IDX_ON

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_EQ_U64(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_EQ_U64(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_EQ_U64

    GPUStaticInst*
    Decoder::decode_OP_SOPC__S_CMP_LG_U64(MachInst iFmt)
    {
        return new Inst_SOPC__S_CMP_LG_U64(&iFmt->iFmt_SOPC);
    } // decode_OP_SOPC__S_CMP_LG_U64

    // Start of the SOPP (scalar program-control) decode stubs; these pass
    // the iFmt_SOPP encoding view.
    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_NOP(MachInst iFmt)
    {
        return new Inst_SOPP__S_NOP(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_NOP

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_ENDPGM(MachInst iFmt)
    {
        return new Inst_SOPP__S_ENDPGM(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_ENDPGM

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_BRANCH(MachInst iFmt)
    {
        return new Inst_SOPP__S_BRANCH(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_BRANCH

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_WAKEUP(MachInst iFmt)
    {
        return new Inst_SOPP__S_WAKEUP(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_WAKEUP

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_SCC0(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_SCC0(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_SCC0

    // SOPP decode stubs for conditional branches, waits, messages, and
    // debug/trace opcodes.  Each allocates the corresponding Inst_SOPP__*
    // object from the iFmt_SOPP encoding view.
    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_SCC1(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_SCC1(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_SCC1

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_VCCZ(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_VCCZ(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_VCCZ

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_VCCNZ(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_VCCNZ(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_VCCNZ

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_EXECZ(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_EXECZ(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_EXECZ

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_EXECNZ(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_EXECNZ(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_EXECNZ

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_BARRIER(MachInst iFmt)
    {
        return new Inst_SOPP__S_BARRIER(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_BARRIER

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_SETKILL(MachInst iFmt)
    {
        return new Inst_SOPP__S_SETKILL(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_SETKILL

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_WAITCNT(MachInst iFmt)
    {
        return new Inst_SOPP__S_WAITCNT(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_WAITCNT

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_SETHALT(MachInst iFmt)
    {
        return new Inst_SOPP__S_SETHALT(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_SETHALT

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_SLEEP(MachInst iFmt)
    {
        return new Inst_SOPP__S_SLEEP(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_SLEEP

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_SETPRIO(MachInst iFmt)
    {
        return new Inst_SOPP__S_SETPRIO(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_SETPRIO

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_SENDMSG(MachInst iFmt)
    {
        return new Inst_SOPP__S_SENDMSG(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_SENDMSG

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_SENDMSGHALT(MachInst iFmt)
    {
        return new Inst_SOPP__S_SENDMSGHALT(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_SENDMSGHALT

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_TRAP(MachInst iFmt)
    {
        return new Inst_SOPP__S_TRAP(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_TRAP

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_ICACHE_INV(MachInst iFmt)
    {
        return new Inst_SOPP__S_ICACHE_INV(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_ICACHE_INV

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_INCPERFLEVEL(MachInst iFmt)
    {
        return new Inst_SOPP__S_INCPERFLEVEL(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_INCPERFLEVEL

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_DECPERFLEVEL(MachInst iFmt)
    {
        return new Inst_SOPP__S_DECPERFLEVEL(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_DECPERFLEVEL

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_TTRACEDATA(MachInst iFmt)
    {
        return new Inst_SOPP__S_TTRACEDATA(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_TTRACEDATA

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_CDBGSYS(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_CDBGSYS(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_CDBGSYS

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_CDBGUSER(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_CDBGUSER(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_CDBGUSER

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_CDBGSYS_OR_USER(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_CDBGSYS_OR_USER

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_CBRANCH_CDBGSYS_AND_USER(MachInst iFmt)
    {
        return new Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_CBRANCH_CDBGSYS_AND_USER
    // Remaining SOPP stubs, the VINTRP (parameter-interpolation) stubs, and
    // the first group of VOP1 (vector one-operand) stubs.  Each allocates
    // the matching Inst_* object from the corresponding encoding view
    // (iFmt_SOPP / iFmt_VINTRP / iFmt_VOP1).
    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_ENDPGM_SAVED(MachInst iFmt)
    {
        return new Inst_SOPP__S_ENDPGM_SAVED(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_ENDPGM_SAVED

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_SET_GPR_IDX_OFF(MachInst iFmt)
    {
        return new Inst_SOPP__S_SET_GPR_IDX_OFF(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_SET_GPR_IDX_OFF

    GPUStaticInst*
    Decoder::decode_OP_SOPP__S_SET_GPR_IDX_MODE(MachInst iFmt)
    {
        return new Inst_SOPP__S_SET_GPR_IDX_MODE(&iFmt->iFmt_SOPP);
    } // decode_OP_SOPP__S_SET_GPR_IDX_MODE

    GPUStaticInst*
    Decoder::decode_OP_VINTRP__V_INTERP_P1_F32(MachInst iFmt)
    {
        return new Inst_VINTRP__V_INTERP_P1_F32(&iFmt->iFmt_VINTRP);
    } // decode_OP_VINTRP__V_INTERP_P1_F32

    GPUStaticInst*
    Decoder::decode_OP_VINTRP__V_INTERP_P2_F32(MachInst iFmt)
    {
        return new Inst_VINTRP__V_INTERP_P2_F32(&iFmt->iFmt_VINTRP);
    } // decode_OP_VINTRP__V_INTERP_P2_F32

    GPUStaticInst*
    Decoder::decode_OP_VINTRP__V_INTERP_MOV_F32(MachInst iFmt)
    {
        return new Inst_VINTRP__V_INTERP_MOV_F32(&iFmt->iFmt_VINTRP);
    } // decode_OP_VINTRP__V_INTERP_MOV_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_NOP(MachInst iFmt)
    {
        return new Inst_VOP1__V_NOP(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_NOP

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_MOV_B32(MachInst iFmt)
    {
        return new Inst_VOP1__V_MOV_B32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_MOV_B32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_READFIRSTLANE_B32(MachInst iFmt)
    {
        return new Inst_VOP1__V_READFIRSTLANE_B32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_READFIRSTLANE_B32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_I32_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_I32_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_I32_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F64_I32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F64_I32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F64_I32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F32_I32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F32_I32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F32_I32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F32_U32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F32_U32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F32_U32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_U32_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_U32_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_U32_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_I32_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_I32_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_I32_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_MOV_FED_B32(MachInst iFmt)
    {
        return new Inst_VOP1__V_MOV_FED_B32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_MOV_FED_B32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F16_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F16_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F16_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F32_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F32_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F32_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_RPI_I32_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_RPI_I32_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_RPI_I32_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_FLR_I32_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_FLR_I32_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_FLR_I32_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_OFF_F32_I4(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_OFF_F32_I4(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_OFF_F32_I4

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F32_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F32_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F32_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F64_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F64_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F64_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F32_UBYTE0(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F32_UBYTE0(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F32_UBYTE0

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F32_UBYTE1(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F32_UBYTE1(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F32_UBYTE1

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F32_UBYTE2(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F32_UBYTE2(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F32_UBYTE2

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F32_UBYTE3(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F32_UBYTE3(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F32_UBYTE3

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_U32_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_U32_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_U32_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F64_U32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F64_U32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F64_U32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_TRUNC_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_TRUNC_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_TRUNC_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CEIL_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_CEIL_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CEIL_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RNDNE_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_RNDNE_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RNDNE_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FLOOR_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_FLOOR_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FLOOR_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FRACT_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_FRACT_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FRACT_F32
    // VOP1 decode stubs for rounding, transcendental, bit-manipulation, and
    // F16 opcodes.  Each allocates the matching Inst_VOP1__* object from the
    // iFmt_VOP1 encoding view.
    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_TRUNC_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_TRUNC_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_TRUNC_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CEIL_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_CEIL_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CEIL_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RNDNE_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_RNDNE_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RNDNE_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FLOOR_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_FLOOR_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FLOOR_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_EXP_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_EXP_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_EXP_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_LOG_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_LOG_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_LOG_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RCP_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_RCP_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RCP_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RCP_IFLAG_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_RCP_IFLAG_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RCP_IFLAG_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RSQ_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_RSQ_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RSQ_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RCP_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_RCP_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RCP_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RSQ_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_RSQ_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RSQ_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_SQRT_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_SQRT_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_SQRT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_SQRT_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_SQRT_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_SQRT_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_SIN_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_SIN_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_SIN_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_COS_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_COS_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_COS_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_NOT_B32(MachInst iFmt)
    {
        return new Inst_VOP1__V_NOT_B32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_NOT_B32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_BFREV_B32(MachInst iFmt)
    {
        return new Inst_VOP1__V_BFREV_B32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_BFREV_B32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FFBH_U32(MachInst iFmt)
    {
        return new Inst_VOP1__V_FFBH_U32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FFBH_U32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FFBL_B32(MachInst iFmt)
    {
        return new Inst_VOP1__V_FFBL_B32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FFBL_B32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FFBH_I32(MachInst iFmt)
    {
        return new Inst_VOP1__V_FFBH_I32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FFBH_I32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FREXP_EXP_I32_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_FREXP_EXP_I32_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FREXP_EXP_I32_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FREXP_MANT_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_FREXP_MANT_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FREXP_MANT_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FRACT_F64(MachInst iFmt)
    {
        return new Inst_VOP1__V_FRACT_F64(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FRACT_F64

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FREXP_EXP_I32_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_FREXP_EXP_I32_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FREXP_EXP_I32_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FREXP_MANT_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_FREXP_MANT_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FREXP_MANT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CLREXCP(MachInst iFmt)
    {
        return new Inst_VOP1__V_CLREXCP(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CLREXCP

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F16_U16(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F16_U16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F16_U16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_F16_I16(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_F16_I16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_F16_I16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_U16_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_U16_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_U16_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CVT_I16_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_CVT_I16_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CVT_I16_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RCP_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_RCP_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RCP_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_SQRT_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_SQRT_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_SQRT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RSQ_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_RSQ_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RSQ_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_LOG_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_LOG_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_LOG_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_EXP_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_EXP_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_EXP_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FREXP_MANT_F16(MachInst iFmt)
    {
        return new
            Inst_VOP1__V_FREXP_MANT_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FREXP_MANT_F16

    // Final VOP1 stubs, then the start of the VOPC (vector-compare) stubs.
    // VOPC stubs allocate the matching Inst_VOPC__* object from the
    // iFmt_VOPC encoding view.
    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FREXP_EXP_I16_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_FREXP_EXP_I16_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FREXP_EXP_I16_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FLOOR_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_FLOOR_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FLOOR_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_CEIL_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_CEIL_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_CEIL_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_TRUNC_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_TRUNC_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_TRUNC_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_RNDNE_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_RNDNE_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_RNDNE_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_FRACT_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_FRACT_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_FRACT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_SIN_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_SIN_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_SIN_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_COS_F16(MachInst iFmt)
    {
        return new Inst_VOP1__V_COS_F16(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_COS_F16

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_EXP_LEGACY_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_EXP_LEGACY_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_EXP_LEGACY_F32

    GPUStaticInst*
    Decoder::decode_OP_VOP1__V_LOG_LEGACY_F32(MachInst iFmt)
    {
        return new Inst_VOP1__V_LOG_LEGACY_F32(&iFmt->iFmt_VOP1);
    } // decode_OP_VOP1__V_LOG_LEGACY_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_CLASS_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_CLASS_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_CLASS_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_CLASS_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_CLASS_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_CLASS_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_CLASS_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_CLASS_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_CLASS_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_CLASS_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_CLASS_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_CLASS_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_CLASS_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_CLASS_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_CLASS_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_CLASS_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_CLASS_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_CLASS_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_F_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_F_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_F_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_LT_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_LT_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_LT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_EQ_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_EQ_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_EQ_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_LE_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_LE_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_LE_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_GT_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_GT_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_GT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_LG_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_LG_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_LG_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_GE_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_GE_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_GE_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_O_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_O_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_O_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_U_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_U_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_U_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NGE_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NGE_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NGE_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NLG_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NLG_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NLG_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NGT_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NGT_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NGT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NLE_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NLE_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NLE_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NEQ_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NEQ_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NEQ_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NLT_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NLT_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NLT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_TRU_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_TRU_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_TRU_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_F_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_F_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_F_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_LT_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_LT_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_LT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_EQ_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_EQ_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_EQ_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_LE_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_LE_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_LE_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_GT_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_GT_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_GT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_LG_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_LG_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_LG_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_GE_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_GE_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_GE_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_O_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_O_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_O_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_U_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_U_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_U_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NGE_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NGE_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NGE_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NLG_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NLG_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NLG_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NGT_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NGT_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NGT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NLE_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NLE_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NLE_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NEQ_F16(MachInst iFmt)
    {
        return new
            Inst_VOPC__V_CMPX_NEQ_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NEQ_F16

    // VOPC decode stubs for the F32 and F64 compare opcodes.  Each allocates
    // the matching Inst_VOPC__* object from the iFmt_VOPC encoding view.
    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NLT_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NLT_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NLT_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_TRU_F16(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_TRU_F16(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_TRU_F16

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_F_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_F_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_F_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_LT_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_LT_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_LT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_EQ_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_EQ_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_EQ_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_LE_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_LE_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_LE_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_GT_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_GT_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_GT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_LG_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_LG_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_LG_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_GE_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_GE_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_GE_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_O_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_O_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_O_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_U_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_U_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_U_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NGE_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NGE_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NGE_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NLG_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NLG_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NLG_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NGT_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NGT_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NGT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NLE_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NLE_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NLE_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NEQ_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NEQ_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NEQ_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NLT_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NLT_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NLT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_TRU_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_TRU_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_TRU_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_F_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_F_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_F_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_LT_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_LT_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_LT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_EQ_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_EQ_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_EQ_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_LE_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_LE_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_LE_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_GT_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_GT_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_GT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_LG_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_LG_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_LG_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_GE_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_GE_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_GE_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_O_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_O_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_O_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_U_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_U_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_U_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NGE_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NGE_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NGE_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NLG_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NLG_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NLG_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NGT_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NGT_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NGT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NLE_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NLE_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NLE_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NEQ_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NEQ_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NEQ_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_NLT_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_NLT_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_NLT_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_TRU_F32(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_TRU_F32(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_TRU_F32

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_F_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_F_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_F_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_LT_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_LT_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_LT_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_EQ_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_EQ_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_EQ_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_LE_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_LE_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_LE_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_GT_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_GT_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_GT_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_LG_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_LG_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_LG_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_GE_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_GE_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_GE_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_O_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_O_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_O_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_U_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_U_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_U_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NGE_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NGE_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NGE_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NLG_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NLG_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NLG_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NGT_F64(MachInst iFmt)
    {
        return new
            Inst_VOPC__V_CMP_NGT_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NGT_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NLE_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NLE_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NLE_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NEQ_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NEQ_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NEQ_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_NLT_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_NLT_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_NLT_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMP_TRU_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMP_TRU_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMP_TRU_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_F_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_F_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_F_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_LT_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_LT_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_LT_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_EQ_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_EQ_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_EQ_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_LE_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_LE_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_LE_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_GT_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_GT_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_GT_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_LG_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_LG_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_LG_F64

    GPUStaticInst*
    Decoder::decode_OP_VOPC__V_CMPX_GE_F64(MachInst iFmt)
    {
        return new Inst_VOPC__V_CMPX_GE_F64(&iFmt->iFmt_VOPC);
    } // decode_OP_VOPC__V_CMPX_GE_F64

    GPUStaticInst*
- Decoder::decode_OP_VOPC__V_CMPX_O_F64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_O_F64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_O_F64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_U_F64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_U_F64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_U_F64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NGE_F64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NGE_F64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NGE_F64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NLG_F64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NLG_F64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NLG_F64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NGT_F64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NGT_F64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NGT_F64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NLE_F64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NLE_F64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NLE_F64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NEQ_F64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NEQ_F64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NEQ_F64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NLT_F64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NLT_F64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NLT_F64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_TRU_F64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_TRU_F64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_TRU_F64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_F_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_F_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_F_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LT_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LT_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LT_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_EQ_I16(MachInst iFmt) - { - return new 
Inst_VOPC__V_CMP_EQ_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_EQ_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LE_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LE_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LE_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GT_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GT_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GT_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_NE_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_NE_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_NE_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GE_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GE_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GE_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_T_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_T_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_T_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_F_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_F_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_F_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LT_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LT_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LT_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_EQ_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_EQ_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_EQ_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LE_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LE_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LE_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GT_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GT_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GT_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_NE_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_NE_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_NE_U16 - - GPUStaticInst* - 
Decoder::decode_OP_VOPC__V_CMP_GE_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GE_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GE_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_T_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_T_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_T_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_F_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_F_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_F_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LT_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LT_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LT_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_EQ_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_EQ_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_EQ_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LE_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LE_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LE_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GT_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GT_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GT_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NE_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NE_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NE_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GE_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GE_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GE_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_T_I16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_T_I16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_T_I16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_F_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_F_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_F_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LT_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LT_U16(&iFmt->iFmt_VOPC); 
- } // decode_OP_VOPC__V_CMPX_LT_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_EQ_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_EQ_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_EQ_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LE_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LE_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LE_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GT_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GT_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GT_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NE_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NE_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NE_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GE_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GE_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GE_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_T_U16(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_T_U16(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_T_U16 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_F_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_F_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_F_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LT_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LT_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LT_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_EQ_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_EQ_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_EQ_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LE_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LE_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LE_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GT_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GT_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GT_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_NE_I32(MachInst iFmt) - { - 
return new Inst_VOPC__V_CMP_NE_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_NE_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GE_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GE_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GE_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_T_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_T_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_T_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_F_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_F_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_F_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LT_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LT_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LT_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_EQ_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_EQ_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_EQ_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LE_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LE_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LE_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GT_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GT_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GT_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_NE_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_NE_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_NE_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GE_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GE_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GE_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_T_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_T_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_T_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_F_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_F_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_F_I32 - - GPUStaticInst* - 
Decoder::decode_OP_VOPC__V_CMPX_LT_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LT_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LT_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_EQ_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_EQ_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_EQ_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LE_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LE_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LE_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GT_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GT_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GT_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NE_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NE_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NE_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GE_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GE_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GE_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_T_I32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_T_I32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_T_I32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_F_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_F_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_F_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LT_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LT_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LT_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_EQ_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_EQ_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_EQ_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LE_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LE_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LE_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GT_U32(MachInst iFmt) - { - return new 
Inst_VOPC__V_CMPX_GT_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GT_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NE_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NE_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NE_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GE_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GE_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GE_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_T_U32(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_T_U32(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_T_U32 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_F_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_F_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_F_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LT_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LT_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LT_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_EQ_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_EQ_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_EQ_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LE_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LE_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LE_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GT_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GT_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GT_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_NE_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_NE_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_NE_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GE_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GE_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GE_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_T_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_T_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_T_I64 - - GPUStaticInst* - 
Decoder::decode_OP_VOPC__V_CMP_F_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_F_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_F_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LT_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LT_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LT_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_EQ_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_EQ_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_EQ_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_LE_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_LE_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_LE_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GT_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GT_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GT_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_NE_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_NE_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_NE_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_GE_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_GE_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_GE_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMP_T_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMP_T_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMP_T_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_F_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_F_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_F_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LT_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LT_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LT_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_EQ_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_EQ_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_EQ_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LE_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LE_I64(&iFmt->iFmt_VOPC); - } // 
decode_OP_VOPC__V_CMPX_LE_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GT_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GT_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GT_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NE_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NE_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NE_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GE_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GE_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GE_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_T_I64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_T_I64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_T_I64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_F_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_F_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_F_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LT_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LT_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LT_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_EQ_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_EQ_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_EQ_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_LE_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_LE_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_LE_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GT_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GT_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GT_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_NE_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_NE_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_NE_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_GE_U64(MachInst iFmt) - { - return new Inst_VOPC__V_CMPX_GE_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_GE_U64 - - GPUStaticInst* - Decoder::decode_OP_VOPC__V_CMPX_T_U64(MachInst 
iFmt) - { - return new Inst_VOPC__V_CMPX_T_U64(&iFmt->iFmt_VOPC); - } // decode_OP_VOPC__V_CMPX_T_U64 - - GPUStaticInst* - Decoder::decode_invalid(MachInst iFmt) - { - fatal("Invalid opcode encountered: %#x\n", iFmt->imm_u32); - - return nullptr; - } -} // namespace Gcn3ISA -} // namespace gem5 diff --git a/src/arch/amdgpu/gcn3/gpu_decoder.hh b/src/arch/amdgpu/gcn3/gpu_decoder.hh deleted file mode 100644 index 1dadae6f30..0000000000 --- a/src/arch/amdgpu/gcn3/gpu_decoder.hh +++ /dev/null @@ -1,1676 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ARCH_GCN3_DECODER_HH__ -#define __ARCH_GCN3_DECODER_HH__ - -#include -#include - -#include "arch/amdgpu/gcn3/gpu_types.hh" - -namespace gem5 -{ - -class GPUStaticInst; - -namespace Gcn3ISA -{ - class Decoder; - union InstFormat; - - using IsaDecodeMethod = GPUStaticInst*(Decoder::*)(MachInst); - - class Decoder - { - public: - Decoder(); - ~Decoder(); - - GPUStaticInst* decode(MachInst mach_inst); - - private: - static IsaDecodeMethod tableDecodePrimary[512]; - static IsaDecodeMethod tableSubDecode_OPU_VOP3[768]; - static IsaDecodeMethod tableSubDecode_OP_DS[256]; - static IsaDecodeMethod tableSubDecode_OP_FLAT[128]; - static IsaDecodeMethod tableSubDecode_OP_MIMG[128]; - static IsaDecodeMethod tableSubDecode_OP_MTBUF[16]; - static IsaDecodeMethod tableSubDecode_OP_MUBUF[128]; - static IsaDecodeMethod tableSubDecode_OP_SMEM[64]; - static IsaDecodeMethod tableSubDecode_OP_SOP1[256]; - static IsaDecodeMethod tableSubDecode_OP_SOPC[128]; - static IsaDecodeMethod tableSubDecode_OP_SOPP[128]; - static IsaDecodeMethod tableSubDecode_OP_VINTRP[4]; - static IsaDecodeMethod tableSubDecode_OP_VOP1[256]; - static IsaDecodeMethod tableSubDecode_OP_VOPC[256]; - - GPUStaticInst* decode_OPU_VOP3__V_ADDC_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_ADD_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_ADD_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_ADD_F64(MachInst); - GPUStaticInst* 
decode_OPU_VOP3__V_ADD_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_ALIGNBIT_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_ALIGNBYTE_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_AND_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_ASHRREV_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_ASHRREV_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_ASHRREV_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_BCNT_U32_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_BFE_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_BFE_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_BFI_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_BFM_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_BFREV_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CEIL_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CEIL_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CEIL_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CLREXCP(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_CLASS_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_CLASS_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_CLASS_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_EQ_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_EQ_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_EQ_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_EQ_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_EQ_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_EQ_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_EQ_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_EQ_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_EQ_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_F_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_F_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_F_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_F_I16(MachInst); - GPUStaticInst* 
decode_OPU_VOP3__V_CMPX_F_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_F_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_F_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_F_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_F_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GE_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GE_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GE_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GE_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GE_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GE_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GE_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GT_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GT_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GT_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GT_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GT_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_GT_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LE_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LE_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LE_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LE_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LE_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LE_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LE_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LG_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LG_F32(MachInst); - 
GPUStaticInst* decode_OPU_VOP3__V_CMPX_LG_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LT_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LT_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LT_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LT_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LT_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_LT_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NEQ_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NEQ_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NEQ_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NE_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NE_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NE_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NE_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NE_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NE_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NGE_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NGE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NGE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NGT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NGT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NGT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NLE_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NLE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NLE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NLG_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NLG_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NLG_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NLT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_NLT_F32(MachInst); - GPUStaticInst* 
decode_OPU_VOP3__V_CMPX_NLT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_O_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_O_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_O_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_TRU_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_TRU_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_TRU_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_T_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_T_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_T_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_T_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_T_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_T_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_U_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_U_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMPX_U_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_CLASS_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_CLASS_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_CLASS_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_EQ_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_EQ_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_EQ_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_EQ_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_EQ_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_EQ_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_EQ_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_EQ_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_EQ_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_F_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_F_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_F_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_F_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_F_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_F_I64(MachInst); - GPUStaticInst* 
decode_OPU_VOP3__V_CMP_F_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_F_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_F_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GE_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GE_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GE_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GE_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GE_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GE_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GE_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GT_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GT_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GT_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GT_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GT_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_GT_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LE_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LE_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LE_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LE_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LE_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LE_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LE_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LG_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LG_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LG_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LT_F16(MachInst); - GPUStaticInst* 
decode_OPU_VOP3__V_CMP_LT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LT_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LT_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LT_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LT_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LT_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_LT_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NEQ_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NEQ_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NEQ_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NE_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NE_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NE_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NE_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NE_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NE_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NGE_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NGE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NGE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NGT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NGT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NGT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NLE_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NLE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NLE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NLG_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NLG_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NLG_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NLT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NLT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_NLT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_O_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_O_F32(MachInst); - GPUStaticInst* 
decode_OPU_VOP3__V_CMP_O_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_TRU_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_TRU_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_TRU_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_T_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_T_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_T_I64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_T_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_T_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_T_U64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_U_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_U_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CMP_U_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CNDMASK_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_COS_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_COS_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CUBEID_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CUBEMA_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CUBESC_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CUBETC_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F16_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F16_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F16_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F32_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F32_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F32_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F32_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F32_UBYTE0(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F32_UBYTE1(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F32_UBYTE2(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F32_UBYTE3(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F64_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F64_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_F64_U32(MachInst); - GPUStaticInst* 
decode_OPU_VOP3__V_CVT_FLR_I32_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_I16_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_OFF_F32_I4(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_PKACCUM_U8_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_PKNORM_I16_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_PKNORM_U16_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_PKRTZ_F16_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_PK_I16_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_PK_U16_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_PK_U8_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_RPI_I32_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_U16_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_U32_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_CVT_U32_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_DIV_FMAS_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_DIV_FMAS_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_DIV_SCALE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_DIV_SCALE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_EXP_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_EXP_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_EXP_LEGACY_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FFBH_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FFBH_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FFBL_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FLOOR_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FLOOR_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FLOOR_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst); - GPUStaticInst* 
decode_OPU_VOP3__V_FMA_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FMA_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FRACT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FRACT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FRACT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FREXP_EXP_I16_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FREXP_EXP_I32_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FREXP_EXP_I32_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FREXP_MANT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FREXP_MANT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_FREXP_MANT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1LL_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1LV_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LDEXP_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LDEXP_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LDEXP_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LERP_U8(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LOG_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LOG_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LOG_LEGACY_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LSHLREV_B16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LSHLREV_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LSHLREV_B64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LSHRREV_B16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LSHRREV_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_LSHRREV_B64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAC_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAC_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAD_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAD_F32(MachInst); - 
GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAD_I32_I24(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAD_I64_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAD_LEGACY_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAD_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAD_U32_U24(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAD_U64_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX3_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX3_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX3_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MAX_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MBCNT_HI_U32_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MBCNT_LO_U32_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MED3_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MED3_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MED3_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN3_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN3_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN3_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN_I16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MIN_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MOV_FED_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MQSAD_PK_U16_U8(MachInst); - 
GPUStaticInst* decode_OPU_VOP3__V_MQSAD_U32_U8(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MSAD_U8(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_HI_I32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_HI_I32_I24(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_HI_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_HI_U32_U24(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_I32_I24(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_LEGACY_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_LO_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_LO_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_MUL_U32_U24(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_NOT_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_OR_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_PERM_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_QSAD_PK_U16_U8(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RCP_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RCP_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RCP_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RCP_IFLAG_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_READLANE_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RNDNE_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RNDNE_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RNDNE_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RSQ_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RSQ_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_RSQ_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SAD_HI_U8(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SAD_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SAD_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SAD_U8(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SIN_F16(MachInst); - 
GPUStaticInst* decode_OPU_VOP3__V_SIN_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SQRT_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SQRT_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SQRT_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUBBREV_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUBB_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUBREV_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUBREV_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUB_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUB_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUB_U16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_TRIG_PREOP_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_TRUNC_F16(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_TRUNC_F32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_TRUNC_F64(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_WRITELANE_B32(MachInst); - GPUStaticInst* decode_OPU_VOP3__V_XOR_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_ADD_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_ADD_RTN_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_ADD_RTN_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_ADD_RTN_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_ADD_SRC2_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_ADD_SRC2_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_ADD_SRC2_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_ADD_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_ADD_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_AND_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_AND_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_AND_RTN_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_AND_RTN_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_AND_SRC2_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_AND_SRC2_B64(MachInst); - GPUStaticInst* 
decode_OP_DS__DS_APPEND(MachInst); - GPUStaticInst* decode_OP_DS__DS_BPERMUTE_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_CMPST_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_CMPST_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_CMPST_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_CMPST_F64(MachInst); - GPUStaticInst* decode_OP_DS__DS_CMPST_RTN_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_CMPST_RTN_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_CMPST_RTN_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_CMPST_RTN_F64(MachInst); - GPUStaticInst* decode_OP_DS__DS_CONDXCHG32_RTN_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_CONSUME(MachInst); - GPUStaticInst* decode_OP_DS__DS_DEC_RTN_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_DEC_RTN_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_DEC_SRC2_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_DEC_SRC2_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_DEC_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_DEC_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_GWS_BARRIER(MachInst); - GPUStaticInst* decode_OP_DS__DS_GWS_INIT(MachInst); - GPUStaticInst* decode_OP_DS__DS_GWS_SEMA_BR(MachInst); - GPUStaticInst* decode_OP_DS__DS_GWS_SEMA_P(MachInst); - GPUStaticInst* decode_OP_DS__DS_GWS_SEMA_RELEASE_ALL(MachInst); - GPUStaticInst* decode_OP_DS__DS_GWS_SEMA_V(MachInst); - GPUStaticInst* decode_OP_DS__DS_INC_RTN_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_INC_RTN_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_INC_SRC2_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_INC_SRC2_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_INC_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_INC_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_F64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_I32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_I64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_RTN_F32(MachInst); - GPUStaticInst* 
decode_OP_DS__DS_MAX_RTN_F64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_RTN_I32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_RTN_I64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_RTN_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_RTN_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_SRC2_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_SRC2_F64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_SRC2_I32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_SRC2_I64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_SRC2_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_SRC2_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MAX_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_F64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_I32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_I64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_RTN_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_RTN_F64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_RTN_I32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_RTN_I64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_RTN_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_RTN_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_SRC2_F32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_SRC2_F64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_SRC2_I32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_SRC2_I64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_SRC2_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_SRC2_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MIN_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MSKOR_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MSKOR_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_MSKOR_RTN_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_MSKOR_RTN_B64(MachInst); - GPUStaticInst* 
decode_OP_DS__DS_NOP(MachInst); - GPUStaticInst* decode_OP_DS__DS_ORDERED_COUNT(MachInst); - GPUStaticInst* decode_OP_DS__DS_OR_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_OR_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_OR_RTN_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_OR_RTN_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_OR_SRC2_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_OR_SRC2_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_PERMUTE_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ2ST64_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ2ST64_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ2_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ2_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ_B128(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ_B96(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ_I16(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ_I8(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ_U16(MachInst); - GPUStaticInst* decode_OP_DS__DS_READ_U8(MachInst); - GPUStaticInst* decode_OP_DS__DS_RSUB_RTN_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_RSUB_RTN_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_RSUB_SRC2_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_RSUB_SRC2_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_RSUB_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_RSUB_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_SUB_RTN_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_SUB_RTN_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_SUB_SRC2_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_SUB_SRC2_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_SUB_U32(MachInst); - GPUStaticInst* decode_OP_DS__DS_SUB_U64(MachInst); - GPUStaticInst* decode_OP_DS__DS_SWIZZLE_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRAP_RTN_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE2ST64_B32(MachInst); - 
GPUStaticInst* decode_OP_DS__DS_WRITE2ST64_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE2_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE2_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE_B128(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE_B16(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE_B8(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE_B96(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE_SRC2_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRITE_SRC2_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRXCHG2ST64_RTN_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRXCHG2ST64_RTN_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRXCHG2_RTN_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRXCHG2_RTN_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRXCHG_RTN_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_WRXCHG_RTN_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_XOR_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_XOR_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_XOR_RTN_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_XOR_RTN_B64(MachInst); - GPUStaticInst* decode_OP_DS__DS_XOR_SRC2_B32(MachInst); - GPUStaticInst* decode_OP_DS__DS_XOR_SRC2_B64(MachInst); - GPUStaticInst* decode_OP_EXP(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_AND(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_AND_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_DEC(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_DEC_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_INC(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_INC_X2(MachInst); - 
GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_OR(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_OR_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SMAX(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SMAX_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SMIN(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SMIN_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SUB(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SUB_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SWAP(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_UMAX(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_UMAX_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_UMIN(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_UMIN_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_XOR(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_XOR_X2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_LOAD_DWORD(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_LOAD_DWORDX2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_LOAD_DWORDX3(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_LOAD_DWORDX4(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_LOAD_SBYTE(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_LOAD_SSHORT(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_LOAD_UBYTE(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_LOAD_USHORT(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_STORE_BYTE(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_STORE_DWORD(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_STORE_DWORDX2(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_STORE_DWORDX3(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_STORE_DWORDX4(MachInst); - GPUStaticInst* decode_OP_FLAT__FLAT_STORE_SHORT(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_ADD(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_AND(MachInst); - GPUStaticInst* 
decode_OP_MIMG__IMAGE_ATOMIC_CMPSWAP(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_DEC(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_INC(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_OR(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_SMAX(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_SMIN(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_SUB(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_SWAP(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_UMAX(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_UMIN(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_ATOMIC_XOR(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_B(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_B_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_B_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_B_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_B(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_B_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_B_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_B_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_L(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_LZ(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_LZ_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_L_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_C_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_L(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_LZ(MachInst); - GPUStaticInst* 
decode_OP_MIMG__IMAGE_GATHER4_LZ_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_L_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GATHER4_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GET_LOD(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_GET_RESINFO(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_LOAD(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_LOAD_MIP(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_LOAD_MIP_PCK(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_LOAD_MIP_PCK_SGN(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_LOAD_PCK(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_LOAD_PCK_SGN(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_B(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_B_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_B_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_B_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_CD(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_CD_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_CD_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_CD_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_B(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_B_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_B_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_B_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_CD(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_CD_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_CD_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_CD_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_CL(MachInst); - GPUStaticInst* 
decode_OP_MIMG__IMAGE_SAMPLE_C_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_D(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_D_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_D_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_D_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_L(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_LZ(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_LZ_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_L_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_C_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_D(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_D_CL(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_D_CL_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_D_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_L(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_LZ(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_LZ_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_L_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_SAMPLE_O(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_STORE(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_STORE_MIP(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_STORE_MIP_PCK(MachInst); - GPUStaticInst* decode_OP_MIMG__IMAGE_STORE_PCK(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_X(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XY(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(MachInst); - GPUStaticInst* 
decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_X(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(MachInst); - GPUStaticInst* - decode_OP_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_STORE_FORMAT_X(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XY(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XYZ(MachInst); - GPUStaticInst* decode_OP_MTBUF__TBUFFER_STORE_FORMAT_XYZW(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_ADD(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_ADD_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_AND(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_AND_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_CMPSWAP(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_DEC(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_DEC_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_INC(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_INC_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_OR(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_OR_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_SMAX(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_SMAX_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_SMIN(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_SMIN_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_SUB(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_SUB_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_SWAP(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_SWAP_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_UMAX(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_UMAX_X2(MachInst); - 
GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_UMIN(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_UMIN_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_XOR(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_ATOMIC_XOR_X2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_DWORD(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_DWORDX2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_DWORDX3(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_DWORDX4(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_X(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_FORMAT_X(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XY(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XYZ(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_FORMAT_XYZW(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_SBYTE(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_SSHORT(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_UBYTE(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_LOAD_USHORT(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_BYTE(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_DWORD(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_DWORDX2(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_DWORDX3(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_DWORDX4(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_X(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XY(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(MachInst); - GPUStaticInst* 
decode_OP_MUBUF__BUFFER_STORE_FORMAT_X(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_FORMAT_XY(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_FORMAT_XYZ(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_FORMAT_XYZW(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_LDS_DWORD(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_STORE_SHORT(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_WBINVL1(MachInst); - GPUStaticInst* decode_OP_MUBUF__BUFFER_WBINVL1_VOL(MachInst); - GPUStaticInst* decode_OP_SMEM__S_ATC_PROBE(MachInst); - GPUStaticInst* decode_OP_SMEM__S_ATC_PROBE_BUFFER(MachInst); - GPUStaticInst* decode_OP_SMEM__S_BUFFER_LOAD_DWORD(MachInst); - GPUStaticInst* decode_OP_SMEM__S_BUFFER_LOAD_DWORDX16(MachInst); - GPUStaticInst* decode_OP_SMEM__S_BUFFER_LOAD_DWORDX2(MachInst); - GPUStaticInst* decode_OP_SMEM__S_BUFFER_LOAD_DWORDX4(MachInst); - GPUStaticInst* decode_OP_SMEM__S_BUFFER_LOAD_DWORDX8(MachInst); - GPUStaticInst* decode_OP_SMEM__S_BUFFER_STORE_DWORD(MachInst); - GPUStaticInst* decode_OP_SMEM__S_BUFFER_STORE_DWORDX2(MachInst); - GPUStaticInst* decode_OP_SMEM__S_BUFFER_STORE_DWORDX4(MachInst); - GPUStaticInst* decode_OP_SMEM__S_DCACHE_INV(MachInst); - GPUStaticInst* decode_OP_SMEM__S_DCACHE_INV_VOL(MachInst); - GPUStaticInst* decode_OP_SMEM__S_DCACHE_WB(MachInst); - GPUStaticInst* decode_OP_SMEM__S_DCACHE_WB_VOL(MachInst); - GPUStaticInst* decode_OP_SMEM__S_LOAD_DWORD(MachInst); - GPUStaticInst* decode_OP_SMEM__S_LOAD_DWORDX16(MachInst); - GPUStaticInst* decode_OP_SMEM__S_LOAD_DWORDX2(MachInst); - GPUStaticInst* decode_OP_SMEM__S_LOAD_DWORDX4(MachInst); - GPUStaticInst* decode_OP_SMEM__S_LOAD_DWORDX8(MachInst); - GPUStaticInst* decode_OP_SMEM__S_MEMREALTIME(MachInst); - GPUStaticInst* decode_OP_SMEM__S_MEMTIME(MachInst); - GPUStaticInst* decode_OP_SMEM__S_STORE_DWORD(MachInst); - GPUStaticInst* decode_OP_SMEM__S_STORE_DWORDX2(MachInst); - GPUStaticInst* decode_OP_SMEM__S_STORE_DWORDX4(MachInst); - GPUStaticInst* 
decode_OP_SOP1__S_ABS_I32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_ANDN2_SAVEEXEC_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_AND_SAVEEXEC_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BCNT0_I32_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BCNT0_I32_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BCNT1_I32_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BCNT1_I32_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BITSET0_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BITSET0_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BITSET1_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BITSET1_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BREV_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_BREV_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_CBRANCH_JOIN(MachInst); - GPUStaticInst* decode_OP_SOP1__S_CMOV_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_CMOV_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_FF0_I32_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_FF0_I32_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_FF1_I32_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_FF1_I32_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_FLBIT_I32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_FLBIT_I32_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_FLBIT_I32_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_FLBIT_I32_I64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_GETPC_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_MOVRELD_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_MOVRELD_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_MOVRELS_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_MOVRELS_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_MOV_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_MOV_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_MOV_FED_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_NAND_SAVEEXEC_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_NOR_SAVEEXEC_B64(MachInst); - GPUStaticInst* 
decode_OP_SOP1__S_NOT_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_NOT_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_ORN2_SAVEEXEC_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_OR_SAVEEXEC_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_QUADMASK_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_QUADMASK_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_RFE_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_SETPC_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_SET_GPR_IDX_IDX(MachInst); - GPUStaticInst* decode_OP_SOP1__S_SEXT_I32_I16(MachInst); - GPUStaticInst* decode_OP_SOP1__S_SEXT_I32_I8(MachInst); - GPUStaticInst* decode_OP_SOP1__S_SWAPPC_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_WQM_B32(MachInst); - GPUStaticInst* decode_OP_SOP1__S_WQM_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_XNOR_SAVEEXEC_B64(MachInst); - GPUStaticInst* decode_OP_SOP1__S_XOR_SAVEEXEC_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ABSDIFF_I32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ADDC_U32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ADD_I32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ADD_U32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ANDN2_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ANDN2_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_AND_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_AND_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ASHR_I32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ASHR_I64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_BFE_I32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_BFE_I64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_BFE_U32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_BFE_U64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_BFM_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_BFM_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_CBRANCH_G_FORK(MachInst); - GPUStaticInst* decode_OP_SOP2__S_CSELECT_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_CSELECT_B64(MachInst); - GPUStaticInst* 
decode_OP_SOP2__S_LSHL_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_LSHL_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_LSHR_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_LSHR_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_MAX_I32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_MAX_U32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_MIN_I32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_MIN_U32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_MUL_I32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_NAND_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_NAND_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_NOR_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_NOR_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ORN2_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_ORN2_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_OR_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_OR_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_RFE_RESTORE_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_SUBB_U32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_SUB_I32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_SUB_U32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_XNOR_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_XNOR_B64(MachInst); - GPUStaticInst* decode_OP_SOP2__S_XOR_B32(MachInst); - GPUStaticInst* decode_OP_SOP2__S_XOR_B64(MachInst); - GPUStaticInst* decode_OP_SOPC__S_BITCMP0_B32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_BITCMP0_B64(MachInst); - GPUStaticInst* decode_OP_SOPC__S_BITCMP1_B32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_BITCMP1_B64(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_EQ_I32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_EQ_U32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_EQ_U64(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_GE_I32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_GE_U32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_GT_I32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_GT_U32(MachInst); - GPUStaticInst* 
decode_OP_SOPC__S_CMP_LE_I32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_LE_U32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_LG_I32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_LG_U32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_LG_U64(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_LT_I32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_CMP_LT_U32(MachInst); - GPUStaticInst* decode_OP_SOPC__S_SETVSKIP(MachInst); - GPUStaticInst* decode_OP_SOPC__S_SET_GPR_IDX_ON(MachInst); - GPUStaticInst* decode_OP_SOPK__S_ADDK_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CBRANCH_I_FORK(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMOVK_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_EQ_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_EQ_U32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_GE_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_GE_U32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_GT_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_GT_U32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_LE_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_LE_U32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_LG_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_LG_U32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_LT_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_CMPK_LT_U32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_GETREG_B32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_MOVK_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_MULK_I32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_SETREG_B32(MachInst); - GPUStaticInst* decode_OP_SOPK__S_SETREG_IMM32_B32(MachInst); - GPUStaticInst* decode_OP_SOPP__S_BARRIER(MachInst); - GPUStaticInst* decode_OP_SOPP__S_BRANCH(MachInst); - GPUStaticInst* decode_OP_SOPP__S_CBRANCH_CDBGSYS(MachInst); - GPUStaticInst* decode_OP_SOPP__S_CBRANCH_CDBGSYS_AND_USER(MachInst); - GPUStaticInst* decode_OP_SOPP__S_CBRANCH_CDBGSYS_OR_USER(MachInst); - GPUStaticInst* 
decode_OP_SOPP__S_CBRANCH_CDBGUSER(MachInst); - GPUStaticInst* decode_OP_SOPP__S_CBRANCH_EXECNZ(MachInst); - GPUStaticInst* decode_OP_SOPP__S_CBRANCH_EXECZ(MachInst); - GPUStaticInst* decode_OP_SOPP__S_CBRANCH_SCC0(MachInst); - GPUStaticInst* decode_OP_SOPP__S_CBRANCH_SCC1(MachInst); - GPUStaticInst* decode_OP_SOPP__S_CBRANCH_VCCNZ(MachInst); - GPUStaticInst* decode_OP_SOPP__S_CBRANCH_VCCZ(MachInst); - GPUStaticInst* decode_OP_SOPP__S_DECPERFLEVEL(MachInst); - GPUStaticInst* decode_OP_SOPP__S_ENDPGM(MachInst); - GPUStaticInst* decode_OP_SOPP__S_ENDPGM_SAVED(MachInst); - GPUStaticInst* decode_OP_SOPP__S_ICACHE_INV(MachInst); - GPUStaticInst* decode_OP_SOPP__S_INCPERFLEVEL(MachInst); - GPUStaticInst* decode_OP_SOPP__S_NOP(MachInst); - GPUStaticInst* decode_OP_SOPP__S_SENDMSG(MachInst); - GPUStaticInst* decode_OP_SOPP__S_SENDMSGHALT(MachInst); - GPUStaticInst* decode_OP_SOPP__S_SETHALT(MachInst); - GPUStaticInst* decode_OP_SOPP__S_SETKILL(MachInst); - GPUStaticInst* decode_OP_SOPP__S_SETPRIO(MachInst); - GPUStaticInst* decode_OP_SOPP__S_SET_GPR_IDX_MODE(MachInst); - GPUStaticInst* decode_OP_SOPP__S_SET_GPR_IDX_OFF(MachInst); - GPUStaticInst* decode_OP_SOPP__S_SLEEP(MachInst); - GPUStaticInst* decode_OP_SOPP__S_TRAP(MachInst); - GPUStaticInst* decode_OP_SOPP__S_TTRACEDATA(MachInst); - GPUStaticInst* decode_OP_SOPP__S_WAITCNT(MachInst); - GPUStaticInst* decode_OP_SOPP__S_WAKEUP(MachInst); - GPUStaticInst* decode_OP_VINTRP__V_INTERP_MOV_F32(MachInst); - GPUStaticInst* decode_OP_VINTRP__V_INTERP_P1_F32(MachInst); - GPUStaticInst* decode_OP_VINTRP__V_INTERP_P2_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_BFREV_B32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CEIL_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CEIL_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CEIL_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CLREXCP(MachInst); - GPUStaticInst* decode_OP_VOP1__V_COS_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_COS_F32(MachInst); - GPUStaticInst* 
decode_OP_VOP1__V_CVT_F16_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F16_I16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F16_U16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F32_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F32_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F32_I32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F32_U32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F32_UBYTE0(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F32_UBYTE1(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F32_UBYTE2(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F32_UBYTE3(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F64_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F64_I32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_F64_U32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_FLR_I32_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_I16_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_I32_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_I32_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_OFF_F32_I4(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_RPI_I32_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_U16_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_U32_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_CVT_U32_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_EXP_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_EXP_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_EXP_LEGACY_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FFBH_I32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FFBH_U32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FFBL_B32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FLOOR_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FLOOR_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FLOOR_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FRACT_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FRACT_F32(MachInst); - GPUStaticInst* 
decode_OP_VOP1__V_FRACT_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FREXP_EXP_I16_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FREXP_EXP_I32_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FREXP_EXP_I32_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FREXP_MANT_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FREXP_MANT_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_FREXP_MANT_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_LOG_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_LOG_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_LOG_LEGACY_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_MOV_B32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_MOV_FED_B32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_NOP(MachInst); - GPUStaticInst* decode_OP_VOP1__V_NOT_B32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RCP_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RCP_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RCP_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RCP_IFLAG_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_READFIRSTLANE_B32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RNDNE_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RNDNE_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RNDNE_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RSQ_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RSQ_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_RSQ_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_SIN_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_SIN_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_SQRT_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_SQRT_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_SQRT_F64(MachInst); - GPUStaticInst* decode_OP_VOP1__V_TRUNC_F16(MachInst); - GPUStaticInst* decode_OP_VOP1__V_TRUNC_F32(MachInst); - GPUStaticInst* decode_OP_VOP1__V_TRUNC_F64(MachInst); - GPUStaticInst* decode_OP_VOP2__V_ADDC_U32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_ADD_F16(MachInst); - GPUStaticInst* 
decode_OP_VOP2__V_ADD_F32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_ADD_U16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_ADD_U32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_AND_B32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_ASHRREV_I16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_ASHRREV_I32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_CNDMASK_B32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_LDEXP_F16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_LSHLREV_B16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_LSHLREV_B32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_LSHRREV_B16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_LSHRREV_B32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MAC_F16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MAC_F32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MADAK_F16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MADAK_F32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MADMK_F16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MADMK_F32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MAX_F16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MAX_F32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MAX_I16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MAX_I32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MAX_U16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MAX_U32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MIN_F16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MIN_F32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MIN_I16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MIN_I32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MIN_U16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MIN_U32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MUL_F16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MUL_F32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MUL_HI_I32_I24(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MUL_HI_U32_U24(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MUL_I32_I24(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MUL_LEGACY_F32(MachInst); - 
GPUStaticInst* decode_OP_VOP2__V_MUL_LO_U16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_MUL_U32_U24(MachInst); - GPUStaticInst* decode_OP_VOP2__V_OR_B32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUBBREV_U32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUBB_U32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUBREV_F16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUBREV_F32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUBREV_U16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUBREV_U32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUB_F16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUB_F32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUB_U16(MachInst); - GPUStaticInst* decode_OP_VOP2__V_SUB_U32(MachInst); - GPUStaticInst* decode_OP_VOP2__V_XOR_B32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_CLASS_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_CLASS_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_CLASS_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_EQ_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_EQ_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_EQ_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_EQ_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_EQ_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_EQ_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_EQ_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_EQ_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_EQ_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_F_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_F_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_F_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_F_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_F_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_F_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_F_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_F_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_F_U64(MachInst); - 
GPUStaticInst* decode_OP_VOPC__V_CMPX_GE_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GE_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GE_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GE_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GE_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GE_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GE_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GE_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GE_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GT_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GT_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GT_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GT_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GT_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GT_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GT_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GT_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_GT_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LE_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LE_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LE_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LE_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LE_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LE_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LE_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LE_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LE_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LG_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LG_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LG_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LT_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LT_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LT_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LT_I16(MachInst); - GPUStaticInst* 
decode_OP_VOPC__V_CMPX_LT_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LT_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LT_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LT_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_LT_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NEQ_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NEQ_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NEQ_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NE_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NE_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NE_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NE_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NE_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NE_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NGE_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NGE_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NGE_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NGT_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NGT_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NGT_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NLE_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NLE_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NLE_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NLG_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NLG_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NLG_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NLT_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NLT_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_NLT_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_O_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_O_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_O_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_TRU_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_TRU_F32(MachInst); - GPUStaticInst* 
decode_OP_VOPC__V_CMPX_TRU_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_T_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_T_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_T_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_T_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_T_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_T_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_U_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_U_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMPX_U_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_CLASS_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_CLASS_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_CLASS_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_EQ_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_EQ_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_EQ_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_EQ_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_EQ_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_EQ_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_EQ_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_EQ_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_EQ_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_F_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_F_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_F_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_F_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_F_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_F_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_F_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_F_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_F_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GE_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GE_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GE_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GE_I16(MachInst); - GPUStaticInst* 
decode_OP_VOPC__V_CMP_GE_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GE_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GE_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GE_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GE_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GT_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GT_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GT_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GT_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GT_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GT_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GT_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GT_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_GT_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LE_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LE_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LE_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LE_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LE_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LE_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LE_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LE_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LE_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LG_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LG_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LG_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LT_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LT_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LT_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LT_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LT_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LT_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LT_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LT_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_LT_U64(MachInst); - GPUStaticInst* 
decode_OP_VOPC__V_CMP_NEQ_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NEQ_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NEQ_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NE_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NE_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NE_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NE_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NE_U32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NE_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NGE_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NGE_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NGE_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NGT_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NGT_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NGT_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NLE_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NLE_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NLE_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NLG_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NLG_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NLG_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NLT_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NLT_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_NLT_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_O_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_O_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_O_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_TRU_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_TRU_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_TRU_F64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_T_I16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_T_I32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_T_I64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_T_U16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_T_U32(MachInst); - 
GPUStaticInst* decode_OP_VOPC__V_CMP_T_U64(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_U_F16(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_U_F32(MachInst); - GPUStaticInst* decode_OP_VOPC__V_CMP_U_F64(MachInst); - GPUStaticInst* subDecode_OPU_VOP3(MachInst); - GPUStaticInst* subDecode_OP_DS(MachInst); - GPUStaticInst* subDecode_OP_FLAT(MachInst); - GPUStaticInst* subDecode_OP_MIMG(MachInst); - GPUStaticInst* subDecode_OP_MTBUF(MachInst); - GPUStaticInst* subDecode_OP_MUBUF(MachInst); - GPUStaticInst* subDecode_OP_SMEM(MachInst); - GPUStaticInst* subDecode_OP_SOP1(MachInst); - GPUStaticInst* subDecode_OP_SOPC(MachInst); - GPUStaticInst* subDecode_OP_SOPP(MachInst); - GPUStaticInst* subDecode_OP_VINTRP(MachInst); - GPUStaticInst* subDecode_OP_VOP1(MachInst); - GPUStaticInst* subDecode_OP_VOPC(MachInst); - GPUStaticInst* decode_invalid(MachInst); - }; - - struct InFmt_DS - { - unsigned int OFFSET0 : 8; - unsigned int OFFSET1 : 8; - unsigned int GDS : 1; - unsigned int OP : 8; - unsigned int pad_25 : 1; - unsigned int ENCODING : 6; - }; - - struct InFmt_DS_1 - { - unsigned int ADDR : 8; - unsigned int DATA0 : 8; - unsigned int DATA1 : 8; - unsigned int VDST : 8; - }; - - struct InFmt_EXP - { - unsigned int EN : 4; - unsigned int TGT : 6; - unsigned int COMPR : 1; - unsigned int DONE : 1; - unsigned int VM : 1; - unsigned int pad_13_25 : 13; - unsigned int ENCODING : 6; - }; - - struct InFmt_EXP_1 - { - unsigned int VSRC0 : 8; - unsigned int VSRC1 : 8; - unsigned int VSRC2 : 8; - unsigned int VSRC3 : 8; - }; - - struct InFmt_FLAT - { - unsigned int pad_0_15 : 16; - unsigned int GLC : 1; - unsigned int SLC : 1; - unsigned int OP : 7; - unsigned int pad_25 : 1; - unsigned int ENCODING : 6; - }; - - struct InFmt_FLAT_1 - { - unsigned int ADDR : 8; - unsigned int DATA : 8; - unsigned int pad_16_22 : 7; - unsigned int TFE : 1; - unsigned int VDST : 8; - }; - - struct InFmt_INST - { - unsigned int ENCODING : 32; - }; - - struct InFmt_MIMG - { - unsigned int pad_0_7 
: 8; - unsigned int DMASK : 4; - unsigned int UNORM : 1; - unsigned int GLC : 1; - unsigned int DA : 1; - unsigned int R128 : 1; - unsigned int TFE : 1; - unsigned int LWE : 1; - unsigned int OP : 7; - unsigned int SLC : 1; - unsigned int ENCODING : 6; - }; - - struct InFmt_MIMG_1 - { - unsigned int VADDR : 8; - unsigned int VDATA : 8; - unsigned int SRSRC : 5; - unsigned int SSAMP : 5; - unsigned int pad_26_30 : 5; - unsigned int D16 : 1; - }; - - struct InFmt_MTBUF - { - unsigned int OFFSET : 12; - unsigned int OFFEN : 1; - unsigned int IDXEN : 1; - unsigned int GLC : 1; - unsigned int OP : 4; - unsigned int DFMT : 4; - unsigned int NFMT : 3; - unsigned int ENCODING : 6; - }; - - struct InFmt_MTBUF_1 - { - unsigned int VADDR : 8; - unsigned int VDATA : 8; - unsigned int SRSRC : 5; - unsigned int pad_21 : 1; - unsigned int SLC : 1; - unsigned int TFE : 1; - unsigned int SOFFSET : 8; - }; - - struct InFmt_MUBUF - { - unsigned int OFFSET : 12; - unsigned int OFFEN : 1; - unsigned int IDXEN : 1; - unsigned int GLC : 1; - unsigned int pad_15 : 1; - unsigned int LDS : 1; - unsigned int SLC : 1; - unsigned int OP : 7; - unsigned int pad_25 : 1; - unsigned int ENCODING : 6; - }; - - struct InFmt_MUBUF_1 - { - unsigned int VADDR : 8; - unsigned int VDATA : 8; - unsigned int SRSRC : 5; - unsigned int pad_21_22 : 2; - unsigned int TFE : 1; - unsigned int SOFFSET : 8; - }; - - struct InFmt_SMEM - { - unsigned int SBASE : 6; - unsigned int SDATA : 7; - unsigned int pad_13_15 : 3; - unsigned int GLC : 1; - unsigned int IMM : 1; - unsigned int OP : 8; - unsigned int ENCODING : 6; - }; - - struct InFmt_SMEM_1 - { - unsigned int OFFSET : 20; - }; - - struct InFmt_SOP1 - { - unsigned int SSRC0 : 8; - unsigned int OP : 8; - unsigned int SDST : 7; - unsigned int ENCODING : 9; - }; - - struct InFmt_SOP2 - { - unsigned int SSRC0 : 8; - unsigned int SSRC1 : 8; - unsigned int SDST : 7; - unsigned int OP : 7; - unsigned int ENCODING : 2; - }; - - struct InFmt_SOPC - { - unsigned int 
SSRC0 : 8; - unsigned int SSRC1 : 8; - unsigned int OP : 7; - unsigned int ENCODING : 9; - }; - - struct InFmt_SOPK - { - unsigned int SIMM16 : 16; - unsigned int SDST : 7; - unsigned int OP : 5; - unsigned int ENCODING : 4; - }; - - struct InFmt_SOPP - { - unsigned int SIMM16 : 16; - unsigned int OP : 7; - unsigned int ENCODING : 9; - }; - - struct InFmt_VINTRP - { - unsigned int VSRC : 8; - unsigned int ATTRCHAN : 2; - unsigned int ATTR : 6; - unsigned int OP : 2; - unsigned int VDST : 8; - unsigned int ENCODING : 6; - }; - - struct InFmt_VOP1 - { - unsigned int SRC0 : 9; - unsigned int OP : 8; - unsigned int VDST : 8; - unsigned int ENCODING : 7; - }; - - struct InFmt_VOP2 - { - unsigned int SRC0 : 9; - unsigned int VSRC1 : 8; - unsigned int VDST : 8; - unsigned int OP : 6; - unsigned int ENCODING : 1; - }; - - struct InFmt_VOP3 - { - unsigned int VDST : 8; - unsigned int ABS : 3; - unsigned int pad_11_14 : 4; - unsigned int CLAMP : 1; - unsigned int OP : 10; - unsigned int ENCODING : 6; - }; - - struct InFmt_VOP3_1 - { - unsigned int SRC0 : 9; - unsigned int SRC1 : 9; - unsigned int SRC2 : 9; - unsigned int OMOD : 2; - unsigned int NEG : 3; - }; - - struct InFmt_VOP3_SDST_ENC - { - unsigned int VDST : 8; - unsigned int SDST : 7; - unsigned int CLAMP : 1; - unsigned int OP : 10; - unsigned int ENCODING : 6; - }; - - struct InFmt_VOPC - { - unsigned int SRC0 : 9; - unsigned int VSRC1 : 8; - unsigned int OP : 8; - unsigned int ENCODING : 7; - }; - - struct InFmt_VOP_DPP - { - unsigned int SRC0 : 8; - unsigned int DPP_CTRL : 9; - unsigned int pad_17_18 : 2; - unsigned int BOUND_CTRL : 1; - unsigned int SRC0_NEG : 1; - unsigned int SRC0_ABS : 1; - unsigned int SRC1_NEG : 1; - unsigned int SRC1_ABS : 1; - unsigned int BANK_MASK : 4; - unsigned int ROW_MASK : 4; - }; - - struct InFmt_VOP_SDWA - { - unsigned int SRC0 : 8; - unsigned int DST_SEL : 3; - unsigned int DST_UNUSED : 2; - unsigned int CLAMP : 1; - unsigned int pad_14_15 : 2; - unsigned int SRC0_SEL : 3; - 
unsigned int SRC0_SEXT : 1; - unsigned int SRC0_NEG : 1; - unsigned int SRC0_ABS : 1; - unsigned int pad_22_23 : 2; - unsigned int SRC1_SEL : 3; - unsigned int SRC1_SEXT : 1; - unsigned int SRC1_NEG : 1; - unsigned int SRC1_ABS : 1; - }; - - union InstFormat - { - InFmt_DS iFmt_DS; - InFmt_DS_1 iFmt_DS_1; - InFmt_EXP iFmt_EXP; - InFmt_EXP_1 iFmt_EXP_1; - InFmt_FLAT iFmt_FLAT; - InFmt_FLAT_1 iFmt_FLAT_1; - InFmt_INST iFmt_INST; - InFmt_MIMG iFmt_MIMG; - InFmt_MIMG_1 iFmt_MIMG_1; - InFmt_MTBUF iFmt_MTBUF; - InFmt_MTBUF_1 iFmt_MTBUF_1; - InFmt_MUBUF iFmt_MUBUF; - InFmt_MUBUF_1 iFmt_MUBUF_1; - InFmt_SMEM iFmt_SMEM; - InFmt_SMEM_1 iFmt_SMEM_1; - InFmt_SOP1 iFmt_SOP1; - InFmt_SOP2 iFmt_SOP2; - InFmt_SOPC iFmt_SOPC; - InFmt_SOPK iFmt_SOPK; - InFmt_SOPP iFmt_SOPP; - InFmt_VINTRP iFmt_VINTRP; - InFmt_VOP1 iFmt_VOP1; - InFmt_VOP2 iFmt_VOP2; - InFmt_VOP3 iFmt_VOP3; - InFmt_VOP3_1 iFmt_VOP3_1; - InFmt_VOP3_SDST_ENC iFmt_VOP3_SDST_ENC; - InFmt_VOPC iFmt_VOPC; - InFmt_VOP_DPP iFmt_VOP_DPP; - InFmt_VOP_SDWA iFmt_VOP_SDWA; - uint32_t imm_u32; - float imm_f32; - }; // union InstFormat -} // namespace Gcn3ISA -} // namespace gem5 - -#endif // __ARCH_GCN3_DECODER_HH__ diff --git a/src/arch/amdgpu/gcn3/gpu_isa.hh b/src/arch/amdgpu/gcn3/gpu_isa.hh deleted file mode 100644 index 4d5aba46c7..0000000000 --- a/src/arch/amdgpu/gcn3/gpu_isa.hh +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2016-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ARCH_GCN3_GPU_ISA_HH__ -#define __ARCH_GCN3_GPU_ISA_HH__ - -#include -#include - -#include "arch/amdgpu/common/tlb.hh" -#include "arch/amdgpu/gcn3/gpu_registers.hh" -#include "gpu-compute/dispatcher.hh" -#include "gpu-compute/hsa_queue_entry.hh" -#include "gpu-compute/misc.hh" - -namespace gem5 -{ - -class Wavefront; - -namespace Gcn3ISA -{ - class GPUISA - { - public: - GPUISA(Wavefront &wf); - - template T - readConstVal(int opIdx) const - { - panic_if(!std::is_integral_v, "Constant values must " - "be an integer.\n"); - T val(0); - - if (isPosConstVal(opIdx)) { - val = (T)readPosConstReg(opIdx); - } - - if (isNegConstVal(opIdx)) { - val = (T)readNegConstReg(opIdx); - } - - return val; - } - - ScalarRegU32 readMiscReg(int opIdx) const; - void writeMiscReg(int opIdx, ScalarRegU32 operandVal); - bool hasScalarUnit() const { return true; } - void advancePC(GPUDynInstPtr gpuDynInst); - - private: - ScalarRegU32 readPosConstReg(int opIdx) const - { - return 
posConstRegs[opIdx - REG_INT_CONST_POS_MIN]; - } - - ScalarRegI32 readNegConstReg(int opIdx) const - { - return negConstRegs[opIdx - REG_INT_CONST_NEG_MIN]; - } - - static const std::array - posConstRegs; - static const std::array - negConstRegs; - - // parent wavefront - Wavefront &wavefront; - - // shader status bits - StatusReg statusReg; - // memory descriptor reg - ScalarRegU32 m0; - }; -} // namespace Gcn3ISA -} // namespace gem5 - -#endif // __ARCH_GCN3_GPU_ISA_HH__ diff --git a/src/arch/amdgpu/gcn3/gpu_mem_helpers.hh b/src/arch/amdgpu/gcn3/gpu_mem_helpers.hh deleted file mode 100644 index 05299e1a0d..0000000000 --- a/src/arch/amdgpu/gcn3/gpu_mem_helpers.hh +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__ -#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__ - -#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh" -#include "arch/amdgpu/gcn3/insts/op_encodings.hh" -#include "debug/GPUMem.hh" -#include "gpu-compute/gpu_dyn_inst.hh" - -namespace gem5 -{ - -/** - * Helper function for instructions declared in op_encodings. This function - * takes in all of the arguments for a given memory request we are trying to - * initialize, then submits the request or requests depending on if the - * original request is aligned or unaligned. - */ -template -inline void -initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type, - bool is_atomic=false) -{ - // local variables - int req_size = N * sizeof(T); - int block_size = gpuDynInst->computeUnit()->cacheLineSize(); - Addr vaddr = 0, split_addr = 0; - bool misaligned_acc = false; - RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr; - PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr; - - gpuDynInst->resetEntireStatusVector(); - for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vaddr = gpuDynInst->addr[lane]; - - /** - * the base address of the cache line where the the last - * byte of the request will be stored. 
- */ - split_addr = roundDown(vaddr + req_size - 1, block_size); - - assert(split_addr <= vaddr || split_addr - vaddr < block_size); - /** - * if the base cache line address of the last byte is - * greater than the address of the first byte then we have - * a misaligned access. - */ - misaligned_acc = split_addr > vaddr; - - if (is_atomic) { - // make sure request is word aligned - assert((vaddr & 0x3) == 0); - - // a given lane's atomic can't cross cache lines - assert(!misaligned_acc); - - req = std::make_shared(vaddr, sizeof(T), 0, - gpuDynInst->computeUnit()->requestorId(), 0, - gpuDynInst->wfDynId, - gpuDynInst->makeAtomicOpFunctor( - &(reinterpret_cast(gpuDynInst->a_data))[lane], - &(reinterpret_cast(gpuDynInst->x_data))[lane])); - } else { - req = std::make_shared(vaddr, req_size, 0, - gpuDynInst->computeUnit()->requestorId(), 0, - gpuDynInst->wfDynId); - } - - if (misaligned_acc) { - gpuDynInst->setStatusVector(lane, 2); - req->splitOnVaddr(split_addr, req1, req2); - gpuDynInst->setRequestFlags(req1); - gpuDynInst->setRequestFlags(req2); - pkt1 = new Packet(req1, mem_req_type); - pkt2 = new Packet(req2, mem_req_type); - pkt1->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane * N]); - pkt2->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane * N + - req1->getSize()/sizeof(T)]); - DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory " - "request for %#x\n", gpuDynInst->cu_id, - gpuDynInst->simdId, gpuDynInst->wfSlotId, lane, - split_addr); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2); - } else { - gpuDynInst->setStatusVector(lane, 1); - gpuDynInst->setRequestFlags(req); - pkt = new Packet(req, mem_req_type); - pkt->dataStatic(&(reinterpret_cast( - gpuDynInst->d_data))[lane * N]); - gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt); - } - } else { // if lane is not active, then no pending requests - gpuDynInst->setStatusVector(lane, 0); - 
} - } -} - -/** - * Helper function for scalar instructions declared in op_encodings. This - * function takes in all of the arguments for a given memory request we are - * trying to initialize, then submits the request or requests depending on if - * the original request is aligned or unaligned. - */ -template -inline void -initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type) -{ - int req_size = N * sizeof(T); - int block_size = gpuDynInst->computeUnit()->cacheLineSize(); - Addr vaddr = gpuDynInst->scalarAddr; - - /** - * the base address of the cache line where the the last byte of - * the request will be stored. - */ - Addr split_addr = roundDown(vaddr + req_size - 1, block_size); - - assert(split_addr <= vaddr || split_addr - vaddr < block_size); - /** - * if the base cache line address of the last byte is greater - * than the address of the first byte then we have a misaligned - * access. - */ - bool misaligned_acc = split_addr > vaddr; - - RequestPtr req = std::make_shared(vaddr, req_size, 0, - gpuDynInst->computeUnit()->requestorId(), 0, - gpuDynInst->wfDynId); - - if (misaligned_acc) { - RequestPtr req1, req2; - req->splitOnVaddr(split_addr, req1, req2); - gpuDynInst->numScalarReqs = 2; - gpuDynInst->setRequestFlags(req1); - gpuDynInst->setRequestFlags(req2); - PacketPtr pkt1 = new Packet(req1, mem_req_type); - PacketPtr pkt2 = new Packet(req2, mem_req_type); - pkt1->dataStatic(gpuDynInst->scalar_data); - pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); - DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for" - " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, split_addr); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); - gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - } else { - gpuDynInst->numScalarReqs = 1; - gpuDynInst->setRequestFlags(req); - PacketPtr pkt = new Packet(req, mem_req_type); - pkt->dataStatic(gpuDynInst->scalar_data); - 
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt); - } -} - -} // namespace gem5 - -#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__ diff --git a/src/arch/amdgpu/gcn3/gpu_registers.hh b/src/arch/amdgpu/gcn3/gpu_registers.hh deleted file mode 100644 index 7f1307f372..0000000000 --- a/src/arch/amdgpu/gcn3/gpu_registers.hh +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ARCH_GCN3_REGISTERS_HH__ -#define __ARCH_GCN3_REGISTERS_HH__ - -#include -#include -#include - -#include "arch/generic/vec_reg.hh" -#include "base/intmath.hh" -#include "base/logging.hh" - -namespace gem5 -{ - -namespace Gcn3ISA -{ - enum OpSelector : int - { - REG_SGPR_MIN = 0, - REG_SGPR_MAX = 101, - REG_FLAT_SCRATCH_LO = 102, - REG_FLAT_SCRATCH_HI = 103, - REG_XNACK_MASK_LO = 104, - REG_XNACK_MASK_HI = 105, - REG_VCC_LO = 106, - REG_VCC_HI = 107, - REG_TBA_LO = 108, - REG_TBA_HI = 109, - REG_TMA_LO = 110, - REG_TMA_HI = 111, - REG_TTMP_0 = 112, - REG_TTMP_1 = 113, - REG_TTMP_2 = 114, - REG_TTMP_3 = 115, - REG_TTMP_4 = 116, - REG_TTMP_5 = 117, - REG_TTMP_6 = 118, - REG_TTMP_7 = 119, - REG_TTMP_8 = 120, - REG_TTMP_9 = 121, - REG_TTMP_10 = 122, - REG_TTMP_11 = 123, - REG_M0 = 124, - REG_RESERVED_1 = 125, - REG_EXEC_LO = 126, - REG_EXEC_HI = 127, - REG_ZERO = 128, - REG_INT_CONST_POS_MIN = 129, - REG_INT_CONST_POS_MAX = 192, - REG_INT_CONST_NEG_MIN = 193, - REG_INT_CONST_NEG_MAX = 208, - REG_RESERVED_2 = 209, - REG_RESERVED_3 = 210, - REG_RESERVED_4 = 211, - REG_RESERVED_5 = 212, - REG_RESERVED_6 = 213, - REG_RESERVED_7 = 214, - REG_RESERVED_8 = 215, - REG_RESERVED_9 = 216, - REG_RESERVED_10 = 217, - REG_RESERVED_11 = 218, - REG_RESERVED_12 = 219, - REG_RESERVED_13 = 220, - REG_RESERVED_14 = 221, - REG_RESERVED_15 = 222, - REG_RESERVED_16 = 223, - REG_RESERVED_17 = 224, - REG_RESERVED_18 = 225, - REG_RESERVED_19 = 226, - REG_RESERVED_20 = 227, - REG_RESERVED_21 = 228, - REG_RESERVED_22 = 229, - REG_RESERVED_23 = 230, - REG_RESERVED_24 = 231, - REG_RESERVED_25 = 232, - REG_RESERVED_26 = 233, - REG_RESERVED_27 = 234, - REG_RESERVED_28 = 235, - REG_RESERVED_29 = 236, - REG_RESERVED_30 = 237, - REG_RESERVED_31 = 238, - REG_RESERVED_32 = 239, - REG_POS_HALF = 240, - REG_NEG_HALF = 241, - REG_POS_ONE = 242, - REG_NEG_ONE = 243, - REG_POS_TWO = 244, - REG_NEG_TWO = 245, - REG_POS_FOUR = 246, - REG_NEG_FOUR = 247, - REG_PI = 248, - /* NOTE: SDWA and SWDA 
both refer to sub d-word addressing */ - REG_SRC_SWDA = 249, - REG_SRC_DPP = 250, - REG_VCCZ = 251, - REG_EXECZ = 252, - REG_SCC = 253, - REG_LDS_DIRECT = 254, - REG_SRC_LITERAL = 255, - REG_VGPR_MIN = 256, - REG_VGPR_MAX = 511 - }; - - constexpr size_t MaxOperandDwords(16); - const int NumVecElemPerVecReg(64); - // op selector values 129 - 192 correspond to const values 1 - 64 - const int NumPosConstRegs = REG_INT_CONST_POS_MAX - - REG_INT_CONST_POS_MIN + 1; - // op selector values 193 - 208 correspond to const values -1 - 16 - const int NumNegConstRegs = REG_INT_CONST_NEG_MAX - - REG_INT_CONST_NEG_MIN + 1; - const int BITS_PER_BYTE = 8; - const int BITS_PER_WORD = 16; - const int MSB_PER_BYTE = (BITS_PER_BYTE - 1); - const int MSB_PER_WORD = (BITS_PER_WORD - 1); - - // typedefs for the various sizes/types of scalar regs - typedef uint8_t ScalarRegU8; - typedef int8_t ScalarRegI8; - typedef uint16_t ScalarRegU16; - typedef int16_t ScalarRegI16; - typedef uint32_t ScalarRegU32; - typedef int32_t ScalarRegI32; - typedef float ScalarRegF32; - typedef uint64_t ScalarRegU64; - typedef int64_t ScalarRegI64; - typedef double ScalarRegF64; - - // typedefs for the various sizes/types of vector reg elements - typedef uint8_t VecElemU8; - typedef int8_t VecElemI8; - typedef uint16_t VecElemU16; - typedef int16_t VecElemI16; - typedef uint32_t VecElemU32; - typedef int32_t VecElemI32; - typedef float VecElemF32; - typedef uint64_t VecElemU64; - typedef int64_t VecElemI64; - typedef double VecElemF64; - - const int DWordSize = sizeof(VecElemU32); - /** - * Size of a single-precision register in DWords. 
- */ - const int RegSizeDWords = sizeof(VecElemU32) / DWordSize; - - using VecRegContainerU32 = - VecRegContainer; - - struct StatusReg - { - StatusReg() : SCC(0), SPI_PRIO(0), USER_PRIO(0), PRIV(0), TRAP_EN(0), - TTRACE_EN(0), EXPORT_RDY(0), EXECZ(0), VCCZ(0), IN_TG(0), - IN_BARRIER(0), HALT(0), TRAP(0), TTRACE_CU_EN(0), VALID(0), - ECC_ERR(0), SKIP_EXPORT(0), PERF_EN(0), COND_DBG_USER(0), - COND_DBG_SYS(0), ALLOW_REPLAY(0), INSTRUCTION_ATC(0), RESERVED(0), - MUST_EXPORT(0), RESERVED_1(0) - { - } - - uint32_t SCC : 1; - uint32_t SPI_PRIO : 2; - uint32_t USER_PRIO : 2; - uint32_t PRIV : 1; - uint32_t TRAP_EN : 1; - uint32_t TTRACE_EN : 1; - uint32_t EXPORT_RDY : 1; - uint32_t EXECZ : 1; - uint32_t VCCZ : 1; - uint32_t IN_TG : 1; - uint32_t IN_BARRIER : 1; - uint32_t HALT : 1; - uint32_t TRAP : 1; - uint32_t TTRACE_CU_EN : 1; - uint32_t VALID : 1; - uint32_t ECC_ERR : 1; - uint32_t SKIP_EXPORT : 1; - uint32_t PERF_EN : 1; - uint32_t COND_DBG_USER : 1; - uint32_t COND_DBG_SYS : 1; - uint32_t ALLOW_REPLAY : 1; - uint32_t INSTRUCTION_ATC : 1; - uint32_t RESERVED : 3; - uint32_t MUST_EXPORT : 1; - uint32_t RESERVED_1 : 4; - }; - - std::string opSelectorToRegSym(int opIdx, int numRegs=0); - int opSelectorToRegIdx(int opIdx, int numScalarRegs); - bool isPosConstVal(int opIdx); - bool isNegConstVal(int opIdx); - bool isConstVal(int opIdx); - bool isLiteral(int opIdx); - bool isScalarReg(int opIdx); - bool isVectorReg(int opIdx); - bool isFlatScratchReg(int opIdx); - bool isExecMask(int opIdx); - bool isVccReg(int opIdx); -} // namespace Gcn3ISA -} // namespace gem5 - -#endif // __ARCH_GCN3_REGISTERS_HH__ diff --git a/src/arch/amdgpu/gcn3/insts/gpu_static_inst.hh b/src/arch/amdgpu/gcn3/insts/gpu_static_inst.hh deleted file mode 100644 index aa742f2e3a..0000000000 --- a/src/arch/amdgpu/gcn3/insts/gpu_static_inst.hh +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ARCH_GCN3_INSTS_GPU_STATIC_INST_HH__ -#define __ARCH_GCN3_INSTS_GPU_STATIC_INST_HH__ - -#include "arch/amdgpu/gcn3/gpu_registers.hh" -#include "arch/amdgpu/gcn3/operand.hh" -#include "gpu-compute/gpu_static_inst.hh" -#include "gpu-compute/scalar_register_file.hh" -#include "gpu-compute/vector_register_file.hh" -#include "gpu-compute/wavefront.hh" - -namespace gem5 -{ - -namespace Gcn3ISA -{ - class GCN3GPUStaticInst : public GPUStaticInst - { - public: - GCN3GPUStaticInst(const std::string &opcode); - ~GCN3GPUStaticInst(); - - void generateDisassembly() override { disassembly = _opcode; } - - bool - isFlatScratchRegister(int opIdx) override - { - return isFlatScratchReg(opIdx); - } - - bool - isExecMaskRegister(int opIdx) override - { - return isExecMask(opIdx); - } - - void initOperandInfo() override { return; } - int getOperandSize(int opIdx) override { return 0; } - - /** - * Return the number of tokens needed by the coalescer. In GCN3 there - * is generally one packet per memory request per lane generated. In - * HSAIL, the number of dest operands is used for loads and src - * operands for stores. This method should be overriden on a per-inst - * basis when this value differs. - */ - int coalescerTokenCount() const override { return 1; } - ScalarRegU32 srcLiteral() const override { return _srcLiteral; } - - protected: - void panicUnimplemented() const; - - /** - * if the instruction has a src literal - an immediate - * value that is part of the instruction stream - we - * store that here - */ - ScalarRegU32 _srcLiteral; - }; // class GCN3GPUStaticInst - -} // namespace Gcn3ISA -} // namespace gem5 - -#endif //__ARCH_GCN3_INSTS_GPU_STATIC_INST_HH__ diff --git a/src/arch/amdgpu/gcn3/insts/inst_util.hh b/src/arch/amdgpu/gcn3/insts/inst_util.hh deleted file mode 100644 index dfade6ad87..0000000000 --- a/src/arch/amdgpu/gcn3/insts/inst_util.hh +++ /dev/null @@ -1,896 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. 
- * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__ -#define __ARCH_GCN3_INSTS_INST_UTIL_HH__ - -#include - -#include "arch/amdgpu/gcn3/gpu_registers.hh" - -namespace gem5 -{ - -// values for SDWA select operations -enum SDWASelVals : int -{ - SDWA_BYTE_0 = 0, /* select data[7:0] */ - SDWA_BYTE_1 = 1, /* select data[15:8] */ - SDWA_BYTE_2 = 2, /* select data[23:16] */ - SDWA_BYTE_3 = 3, /* select data[31:24] */ - SDWA_WORD_0 = 4, /* select data[15:0] */ - SDWA_WORD_1 = 5, /* select data[31:16] */ - SDWA_DWORD = 6 /* select data[31:0] */ -}; - -// values for format of destination bits for SDWA operations -enum SDWADstVals : int -{ - SDWA_UNUSED_PAD = 0, /* Pad all unused bits with 0 */ - SDWA_UNUSED_SEXT = 1, /* Sign-extend upper bits; pad lower bits w/ 0 */ - SDWA_UNUSED_PRESERVE = 2 /* select data[31:0] */ -}; - -// values for DPP operations -enum SqDPPVals : int -{ - SQ_DPP_QUAD_PERM_MAX = 0xFF, - SQ_DPP_RESERVED = 0x100, - SQ_DPP_ROW_SL1 = 0x101, - SQ_DPP_ROW_SL15 = 0x10F, - SQ_DPP_ROW_SR1 = 0x111, - SQ_DPP_ROW_SR15 = 0x11F, - SQ_DPP_ROW_RR1 = 0x121, - SQ_DPP_ROW_RR15 = 0x12F, - SQ_DPP_WF_SL1 = 0x130, - SQ_DPP_WF_RL1 = 0x134, - SQ_DPP_WF_SR1 = 0x138, - SQ_DPP_WF_RR1 = 0x13C, - SQ_DPP_ROW_MIRROR = 0x140, - SQ_DPP_ROW_HALF_MIRROR = 0x141, - SQ_DPP_ROW_BCAST15 = 0x142, - SQ_DPP_ROW_BCAST31 = 0x143 -}; -static const int ROW_SIZE = 16; /* 16 registers per row */ -static const int NUM_BANKS = 4; /* 64 registers, 16/bank */ - -namespace Gcn3ISA -{ - template - inline T - wholeQuadMode(T val) - { - T wqm = 0; - T mask = 0xF; - - for (T bits = val; mask != 0; mask <<= 4) - if ((bits & mask) != 0) - wqm |= mask; - - return wqm; - } - - template - inline T - quadMask(T val) - { - T qmsk = 0; - T mask = 0xF; - T qbit = 0x1; - - for (T bits = val; mask != 0; mask <<= 4, qbit <<= 1) { - if (bits & mask) { - qmsk |= qbit; - } - } - - return qmsk; - } - - template - inline ScalarRegI32 - countZeroBits(T val) - { - ScalarRegI32 num_zeros - = std::numeric_limits::digits - 
popCount(val); - - return num_zeros; - } - - template - inline ScalarRegI32 - findFirstZero(T val) - { - if (val == ~T(0)) { - return -1; - } - - return findLsbSet(~val); - } - - template - inline ScalarRegI32 - findFirstOne(T val) - { - if (!val) { - return -1; - } - - return findLsbSet(val); - } - - template - inline ScalarRegI32 - findFirstOneMsb(T val) - { - if (!val) { - return -1; - } - - return findMsbSet(val); - } - - template - inline ScalarRegI32 - countZeroBitsMsb(T val) - { - if (!val) { - return -1; - } - - return std::numeric_limits::digits - 1 - findMsbSet(val); - } - - inline ScalarRegI32 - firstOppositeSignBit(ScalarRegI32 val) - { - bool found(false); - bool sign_bit = (val & 0x80000000) != 0; - ScalarRegU32 tmp_val(0); - int count(0); - - if (!val || val == -1) { - return -1; - } - - for (int i = 0; i < std::numeric_limits::digits; ++i) { - tmp_val = val & (0x80000000 >> i); - - if (!sign_bit) { - if (tmp_val) { - found = true; - break; - } - } else { - if (!tmp_val) { - found = true; - break; - } - } - ++count; - } - - if (found) { - return count; - } else { - return -1; - } - } - - inline ScalarRegI32 - firstOppositeSignBit(ScalarRegI64 val) - { - bool found(false); - bool sign_bit = (val & 0x8000000000000000ULL) != 0; - ScalarRegU64 tmp_val(0); - int count(0); - - if (!val || val == -1) { - return -1; - } - - for (int i = 0; i < std::numeric_limits::digits; ++i) { - tmp_val = val & (0x8000000000000000ULL >> i); - - if (!sign_bit) { - if (tmp_val) { - found = true; - break; - } - } else { - if (!tmp_val) { - found = true; - break; - } - } - ++count; - } - - if (found) { - return count; - } else { - return -1; - } - } - - template - inline T - median(T val_0, T val_1, T val_2) - { - if (std::is_floating_point_v) { - return std::fmax(std::fmin(val_0, val_1), - std::fmin(std::fmax(val_0, val_1), val_2)); - } else { - return std::max(std::min(val_0, val_1), - std::min(std::max(val_0, val_1), val_2)); - } - } - - template - inline T 
roundNearestEven(T val) - { - T int_part = 0; - T nearest_round = std::floor(val + 0.5); - if ((int)std::floor(val) % 2 == 0 - && std::modf(std::abs(val), &int_part) == 0.5) { - nearest_round = nearest_round - 1; - } - - return nearest_round; - } - - inline VecElemU32 - muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, - VecElemU64 val_2) - { - __uint128_t u0 = (__uint128_t)val_0; - __uint128_t u1 = (__uint128_t)val_1; - __uint128_t u2 = (__uint128_t)val_2; - __uint128_t result = u0 * u1 + u2; - - dst = (VecElemU64)result; - - return (VecElemU32)(result >> 64) ? 1 : 0; - } - - inline VecElemU32 - muladd(VecElemI64 &dst, VecElemI32 val_0, VecElemI32 val_1, - VecElemI64 val_2) - { - __int128_t u0 = (__int128_t)val_0; - __int128_t u1 = (__int128_t)val_1; - __int128_t u2 = (__int128_t)val_2; - __int128_t result = u0 * u1 + u2; - - dst = (VecElemI64)result; - - return (VecElemU32)(result >> 64) ? 1 : 0; - } - - /** - * dppInstImpl is a helper function that performs the inputted operation - * on the inputted vector register lane. The returned output lane - * represents the input lane given the destination lane and DPP_CTRL word. 
- * - * Currently the values are: - * 0x0 - 0xFF: full permute of four threads - * 0x100: reserved - * 0x101 - 0x10F: row shift right by 1-15 threads - * 0x111 - 0x11F: row shift right by 1-15 threads - * 0x121 - 0x12F: row shift right by 1-15 threads - * 0x130: wavefront left shift by 1 thread - * 0x134: wavefront left rotate by 1 thread - * 0x138: wavefront right shift by 1 thread - * 0x13C: wavefront right rotate by 1 thread - * 0x140: mirror threads within row - * 0x141: mirror threads within 1/2 row (8 threads) - * 0x142: broadcast 15th thread of each row to next row - * 0x143: broadcast thread 31 to rows 2 and 3 - */ - int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, - int rowOffset, bool & outOfBounds) - { - // local variables - // newLane will be the same as the input lane unless swizzling happens - int newLane = currLane; - // for shift/rotate permutations; positive values are LEFT rotates - int count = 1; - int localRowOffset = rowOffset; - int localRowNum = rowNum; - - if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF} - int quadBase = (currLane & ~(3)); - int quadPix = (currLane & 3); - quadPix = ((dppCtrl >> (2 * quadPix)) & 3); - newLane = (quadBase | quadPix); - } else if (dppCtrl == SQ_DPP_RESERVED) { - panic("ERROR: instruction using reserved DPP_CTRL value\n"); - } else if ((dppCtrl >= SQ_DPP_ROW_SL1) && - (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15} - count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1); - if ((localRowOffset + count >= 0) && - (localRowOffset + count < ROW_SIZE)) { - localRowOffset += count; - newLane = (rowNum | localRowOffset); - } else { - outOfBounds = true; - } - } else if ((dppCtrl >= SQ_DPP_ROW_SR1) && - (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15} - count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1); - if ((localRowOffset + count >= 0) && - (localRowOffset + count < ROW_SIZE)) { - localRowOffset += count; - newLane = (rowNum | localRowOffset); - } else { - outOfBounds = true; - } - } else if ((dppCtrl >= 
SQ_DPP_ROW_RR1) && - (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15} - count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1); - localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE; - newLane = (rowNum | localRowOffset); - } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1 - count = 1; - if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) { - newLane += count; - } else { - outOfBounds = true; - } - } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1 - count = 1; - newLane = (currLane + count + NumVecElemPerVecReg) % - NumVecElemPerVecReg; - } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1 - count = -1; - int currVal = (currLane + count); - if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) { - newLane += count; - } else { - outOfBounds = true; - } - } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1 - count = -1; - newLane = (currLane + count + NumVecElemPerVecReg) % - NumVecElemPerVecReg; - } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR - localRowOffset = (15 - localRowOffset); - newLane = (rowNum | localRowOffset); - } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR - localRowNum = (currLane & -0x7); - localRowOffset = (currLane & 0x7); - localRowOffset = (7 - localRowNum); - newLane = (localRowNum | localRowOffset); - } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15 - count = 15; - if (currLane > count) { - newLane = (currLane & ~count) - 1; - } - } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31 - count = 31; - if (currLane > count) { - newLane = (currLane & ~count) - 1; - } - } else { - panic("Unimplemented DPP control operation: %d\n", dppCtrl); - } - - return newLane; - } - - /** - * processDPP is a helper function for implementing Data Parallel Primitive - * instructions. This function may be called by many different VOP1 - * instructions to do operations within a register. 
- */ - template - void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, - T & src0) - { - // local variables - SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL; - int boundCtrl = dppInst.BOUND_CTRL; - int bankMask = dppInst.BANK_MASK; - int rowMask = dppInst.ROW_MASK; - // row, bank info to be calculated per lane - int rowNum = 0, bankNum = 0, rowOffset = 0; - // outLane will be the same as the input lane unless swizzling happens - int outLane = 0; - bool laneDisabled = false; - // flags used for determining if a lane should be written to/reset/etc. - bool outOfBounds = false, zeroSrc = false; - long long threadValid = 0; - - /** - * STEP 1a: check if the absolute value (ABS) or negation (NEG) tags - * are set. If so, do the appropriate action(s) on src0 and/or src1. - * - * NOTE: ABS takes priority over NEG. - */ - if (dppInst.SRC0_NEG) { - src0.negModifier(); - } - - if (dppInst.SRC0_ABS) { - src0.absModifier(); - } - - // iterate over all register lanes, performing steps 2-4 - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - threadValid = (0x1LL << lane); - /** - * STEP 2: check the row and bank mask values. These determine - * which threads are enabled for the subsequent DPP_CTRL - * operations. 
- */ - rowNum = (lane / ROW_SIZE); - rowOffset = (lane % ROW_SIZE); - bankNum = (rowOffset / NUM_BANKS); - - if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ || - ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) { - laneDisabled = true; - continue; - } - - /** - * STEP 4: Handle the potential values of DPP_CTRL: - * 0x0 - 0xFF: full permute of four threads - * 0x100: reserved - * 0x101 - 0x10F: row shift right by 1-15 threads - * 0x111 - 0x11F: row shift right by 1-15 threads - * 0x121 - 0x12F: row shift right by 1-15 threads - * 0x130: wavefront left shift by 1 thread - * 0x134: wavefront left rotate by 1 thread - * 0x138: wavefront right shift by 1 thread - * 0x13C: wavefront right rotate by 1 thread - * 0x140: mirror threads within row - * 0x141: mirror threads within 1/2 row (8 threads) - * 0x142: broadcast 15th thread of each row to next row - * 0x143: broadcast thread 31 to rows 2 and 3 - */ - if (!laneDisabled) { - outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset, - outOfBounds); - } - - /** - * STEP 4: Implement bound control for disabled threads. If thread - * is disabled but boundCtrl is set, then we need to set the source - * data to 0 (i.e., set this lane to 0). - */ - if (laneDisabled) { - threadValid = 0; - } else if (outOfBounds) { - if (boundCtrl == 1) { - zeroSrc = true; - } else { - threadValid = 0; - } - } else if (!gpuDynInst->exec_mask[lane]) { - if (boundCtrl == 1) { - zeroSrc = true; - } else { - threadValid = 0; - } - } - - if (threadValid != 0 && !outOfBounds && !zeroSrc) { - assert(!laneDisabled); - src0[outLane] = src0[lane]; - } else if (zeroSrc) { - src0[lane] = 0; - } - - // reset for next iteration - laneDisabled = false; - } - } - - /** - * processDPP is a helper function for implementing Data Parallel Primitive - * instructions. This function may be called by many different - * VOP2/VOPC instructions to do operations within a register. 
- */ - template - void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, - T & src0, T & src1) - { - /** - * STEP 1b: check if the absolute value (ABS) or negation (NEG) tags - * are set. If so, do the appropriate action(s) on src0 and/or src1. - * - * NOTE: ABS takes priority over NEG. - */ - if (dppInst.SRC1_NEG) { - src1.negModifier(); - } - - if (dppInst.SRC1_ABS) { - src1.absModifier(); - } - - // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1, - // which is only used for negation/absolute value, call other version - // to do everything else. - processDPP(gpuDynInst, dppInst, src0); - } - - /** - * sdwaInstSrcImpl_helper contains the per-lane code for selecting the - * appropriate bytes/words of the lane and doing the appropriate - * masking/padding/sign extending. It returns the value after these - * operations are done on it. - */ - template - T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, - const SDWASelVals sel, const bool signExt) - { - // local variables - int low_bit = 0, high_bit = 0; - bool signExt_local = signExt; - T retVal = 0; - - // if we're preserving all of the bits, then we can immediately return - if (sel == SDWA_DWORD) { - return currOperVal; - } - - if (sel < SDWA_WORD_0) { // we are selecting 1 byte - /* - Process byte 0 first. This code eiter selects the original bits - of byte 0, or makes the bits of the selected byte be byte 0 (and - next either sign extends or zero's out upper bits). 
- */ - low_bit = (sel * Gcn3ISA::BITS_PER_BYTE); - high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE; - retVal = bits(currOperVal, high_bit, low_bit); - - // make sure update propagated, since used next - fatal_if(bits(retVal, Gcn3ISA::MSB_PER_BYTE) != - bits(origOperVal, high_bit), - "ERROR: SDWA byte update not propagated: retVal: %d, " - "orig: %d\n", bits(retVal, Gcn3ISA::MSB_PER_BYTE), - bits(origOperVal, high_bit)); - // sign extended value depends on upper-most bit of the new byte 0 - signExt_local = (signExt && - (bits(retVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80)); - - // process all other bytes -- if sign extending, make them 1, else - // all 0's so leave as is - if (signExt_local) { - retVal = (uint32_t)sext(retVal); - } - } else if (sel < SDWA_DWORD) { // we are selecting 1 word - /* - Process word 0 first. This code eiter selects the original bits - of word 0, or makes the bits of the selected word be word 0 (and - next either sign extends or zero's out upper bits). - */ - low_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD; - high_bit = low_bit + Gcn3ISA::MSB_PER_WORD; - retVal = bits(currOperVal, high_bit, low_bit); - - // make sure update propagated, since used next - fatal_if(bits(retVal, Gcn3ISA::MSB_PER_WORD) != - bits(origOperVal, high_bit), - "ERROR: SDWA word update not propagated: retVal: %d, " - "orig: %d\n", - bits(retVal, Gcn3ISA::MSB_PER_WORD), - bits(origOperVal, high_bit)); - // sign extended value depends on upper-most bit of the new word 0 - signExt_local = (signExt && - (bits(retVal, Gcn3ISA::MSB_PER_WORD, 0) & - 0x8000)); - - // process other word -- if sign extending, make them 1, else all - // 0's so leave as is - if (signExt_local) { - retVal = (uint32_t)sext(retVal); - } - } else { - assert(sel != SDWA_DWORD); // should have returned earlier - panic("Unimplemented SDWA select operation: %d\n", sel); - } - - return retVal; - } - - - /** - * sdwaInstSrcImpl is a helper function that selects the appropriate - * bits/bytes for each lane of the 
inputted source operand of an SDWA - * instruction, does the appropriate masking/padding/sign extending for the - * non-selected bits/bytes, and updates the operands values with the - * resultant value. - * - * The desired behavior is: - * 1. Select the appropriate bits/bytes based on sel: - * 0 (SDWA_BYTE_0): select data[7:0] - * 1 (SDWA_BYTE_1): select data[15:8] - * 2 (SDWA_BYTE_2): select data[23:16] - * 3 (SDWA_BYTE_3): select data[31:24] - * 4 (SDWA_WORD_0): select data[15:0] - * 5 (SDWA_WORD_1): select data[31:16] - * 6 (SDWA_DWORD): select data[31:0] - * 2. if sign extend is set, then sign extend the value - */ - template - void sdwaInstSrcImpl(T & currOper, T & origCurrOper, - const SDWASelVals sel, const bool signExt) - { - // iterate over all lanes, setting appropriate, selected value - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane], - origCurrOper[lane], sel, - signExt); - } - } - - - /** - * sdwaInstDstImpl_helper contains the per-lane code for selecting the - * appropriate bytes/words of the lane and doing the appropriate - * masking/padding/sign extending. It returns the value after these - * operations are done on it. 
- */ - template - T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, - const bool clamp, const SDWASelVals sel, - const SDWADstVals unusedBits_format) - { - // local variables - int low_bit = 0, high_bit = 0; - bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT); - //bool pad = (unusedBits_format == SDWA_UNUSED_PAD); - bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE); - T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0, - origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0; - - // if we're preserving all of the bits, then we can immediately return - if (unusedBits_format == SDWA_UNUSED_PRESERVE) { - assert(sel == SDWA_DWORD); - return currDstVal; - } else if (sel == SDWA_DWORD) { - // NOTE: users may set the unused bits variable to anything in this - // scenario, because it will be ignored - return currDstVal; - } - - if (sel < SDWA_WORD_0) { // we are selecting 1 byte - // if we sign extended depends on upper-most bit of byte 0 - signExt = (signExt && - (bits(currDstVal, Gcn3ISA::MSB_PER_WORD, 0) & 0x80)); - - for (int byte = 0; byte < 4; ++byte) { - low_bit = byte * Gcn3ISA::BITS_PER_BYTE; - high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE; - /* - Options: - 1. byte == sel: we are keeping all bits in this byte - 2. preserve is set: keep this byte as is because the - output preserve flag is set - 3. byte > sel && signExt: we're sign extending and - this byte is one of the bytes we need to sign extend - */ - origBits_thisByte = bits(origDstVal, high_bit, low_bit); - currBits_thisByte = bits(currDstVal, high_bit, low_bit); - newBits = ((byte == sel) ? origBits_thisByte : - ((preserve) ? currBits_thisByte : - (((byte > sel) && signExt) ? 
0xff : 0))); - retVal = insertBits(retVal, high_bit, low_bit, newBits); - } - } else if (sel < SDWA_DWORD) { // we are selecting 1 word - low_bit = 0; - high_bit = low_bit + Gcn3ISA::MSB_PER_WORD; - // if we sign extended depends on upper-most bit of word 0 - signExt = (signExt && - (bits(currDstVal, high_bit, low_bit) & 0x8000)); - - for (int word = 0; word < 2; ++word) { - low_bit = word * Gcn3ISA::BITS_PER_WORD; - high_bit = low_bit + Gcn3ISA::MSB_PER_WORD; - /* - Options: - 1. word == sel & 1: we are keeping all bits in this word - 2. preserve is set: keep this word as is because the - output preserve flag is set - 3. word > (sel & 1) && signExt: we're sign extending and - this word is one of the words we need to sign extend - */ - origBits_thisWord = bits(origDstVal, high_bit, low_bit); - currBits_thisWord = bits(currDstVal, high_bit, low_bit); - newBits = ((word == (sel & 0x1)) ? origBits_thisWord : - ((preserve) ? currBits_thisWord : - (((word > (sel & 0x1)) && signExt) ? 0xffff : 0))); - retVal = insertBits(retVal, high_bit, low_bit, newBits); - } - } else { - assert(sel != SDWA_DWORD); // should have returned earlier - panic("Unimplemented SDWA select operation: %d\n", sel); - } - - return retVal; - } - - - /** - * sdwaInstDestImpl is a helper function that selects the appropriate - * bits/bytes for the inputted dest operand of an SDWA instruction, does - * the appropriate masking/padding/sign extending for the non-selected - * bits/bytes, and updates the operands values with the resultant value. - * - * The desired behavior is: - * 1. Select the appropriate bits/bytes based on sel: - * 0 (SDWA_BYTE_0): select data[7:0] - * 1 (SDWA_BYTE_1): select data[15:8] - * 2 (SDWA_BYTE_2): select data[23:16] - * 3 (SDWA_BYTE_3): select data[31:24] - * 4 (SDWA_WORD_0): select data[15:0] - * 5 (SDWA_WORD_1): select data[31:16] - * 6 (SDWA_DWORD): select data[31:0] - * 2. 
either pad, sign extend, or select all bits based on the value of - * unusedBits_format: - * 0 (SDWA_UNUSED_PAD): pad all unused bits with 0 - * 1 (SDWA_UNUSED_SEXT): sign-extend upper bits; pad lower bits w/ 0 - * 2 (SDWA_UNUSED_PRESERVE): select data[31:0] - */ - template - void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp, - const SDWASelVals sel, - const SDWADstVals unusedBits_format) - { - // iterate over all lanes, setting appropriate, selected value - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane], - origDstOper[lane], clamp, - sel, unusedBits_format); - } - } - - - /** - * processSDWA_srcHelper is a helper function for implementing sub d-word - * addressing instructions for the src operands. This function may be - * called by many different VOP1/VOP2/VOPC instructions to do operations - * within a register. This function is also agnostic of which operand it - * is operating on, so that it can be called for any src operand. - */ - template - void processSDWA_src_helper(T & currSrc, T & origCurrSrc, - const SDWASelVals src_sel, - const bool src_signExt, const bool src_abs, - const bool src_neg) - { - /** - * STEP 1: check if the absolute value (ABS) or negation (NEG) tags - * are set. If so, do the appropriate action(s) on the src operand. - * - * NOTE: According to the CSim implementation, ABS takes priority over - * NEG. - */ - if (src_neg) { - currSrc.negModifier(); - } - - if (src_abs) { - currSrc.absModifier(); - } - - /** - * STEP 2: select the appropriate bits for each lane of source operand. - */ - sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt); - } - - - /** - * processSDWA_src is a helper function for implementing sub d-word - * addressing instructions for the src operands. This function may be - * called by many different VOP1 instructions to do operations within a - * register. 
processSDWA_dst is called after the math, while - * processSDWA_src is called before the math. - */ - template - void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0) - { - // local variables - const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL; - const bool src0_signExt = sdwaInst.SRC0_SEXT; - const bool src0_neg = sdwaInst.SRC0_NEG; - const bool src0_abs = sdwaInst.SRC0_ABS; - - // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1 - // operand. So ensure that SRC1 fields are not set, then call helper - // function only on src0. - assert(!sdwaInst.SRC1_SEXT); - assert(!sdwaInst.SRC1_NEG); - assert(!sdwaInst.SRC1_ABS); - - processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt, - src0_abs, src0_neg); - } - - - /** - * processSDWA_src is a helper function for implementing sub d-word - * addressing instructions. This function may be called by many different - * VOP2/VOPC instructions to do operations within a register. - * processSDWA_dst is called after the math, while processSDWA_src is - * called before the math. - */ - template - void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0, - T & src1, T & origSrc1) - { - // local variables - const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL; - const bool src0_signExt = sdwaInst.SRC0_SEXT; - const bool src0_neg = sdwaInst.SRC0_NEG; - const bool src0_abs = sdwaInst.SRC0_ABS; - const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL; - const bool src1_signExt = sdwaInst.SRC1_SEXT; - const bool src1_neg = sdwaInst.SRC1_NEG; - const bool src1_abs = sdwaInst.SRC1_ABS; - - processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt, - src0_abs, src0_neg); - processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt, - src1_abs, src1_neg); - } - - - /** - * processSDWA_dst is a helper function for implementing sub d-word - * addressing instructions for the dst operand. 
This function may be - * called by many different VOP1/VOP2/VOPC instructions to do operations - * within a register. processSDWA_dst is called after the math, while - * processSDWA_src is called before the math. - */ - template - void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst) - { - // local variables - const SDWADstVals dst_unusedBits_format = - (SDWADstVals)sdwaInst.DST_UNUSED; - const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL; - const bool clamp = sdwaInst.CLAMP; - - /** - * STEP 1: select the appropriate bits for dst and pad/sign-extend as - * appropriate. - */ - sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format); - } -} // namespace Gcn3ISA -} // namespace gem5 - -#endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__ diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc deleted file mode 100644 index b9d29a2204..0000000000 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ /dev/null @@ -1,41675 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "arch/amdgpu/gcn3/insts/instructions.hh" - -#include - -#include "arch/amdgpu/gcn3/insts/inst_util.hh" -#include "debug/GCN3.hh" -#include "debug/GPUSync.hh" -#include "gpu-compute/shader.hh" - -namespace gem5 -{ - -namespace Gcn3ISA -{ - - Inst_SOP2__S_ADD_U32::Inst_SOP2__S_ADD_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_add_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADD_U32 - - Inst_SOP2__S_ADD_U32::~Inst_SOP2__S_ADD_U32() - { - } // ~Inst_SOP2__S_ADD_U32 - - // D.u = S0.u + S1.u; - // SCC = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an unsigned - // overflow/carry-out. - void - Inst_SOP2__S_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() + src1.rawData(); - scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData()) - >= 0x100000000ULL ? 
1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_SUB_U32::Inst_SOP2__S_SUB_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_sub_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUB_U32 - - Inst_SOP2__S_SUB_U32::~Inst_SOP2__S_SUB_U32() - { - } // ~Inst_SOP2__S_SUB_U32 - - // D.u = S0.u - S1.u; - // SCC = (S1.u > S0.u ? 1 : 0) is an unsigned overflow or carry-out. - void - Inst_SOP2__S_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() - src1.rawData(); - scc = (src1.rawData() > src0.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_ADD_I32::Inst_SOP2__S_ADD_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_add_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADD_I32 - - Inst_SOP2__S_ADD_I32::~Inst_SOP2__S_ADD_I32() - { - } // ~Inst_SOP2__S_ADD_I32 - - // D.i = S0.i + S1.i; - // SCC = (S0.u[31] == S1.u[31] && S0.u[31] != D.u[31]) is a signed - // overflow. - void - Inst_SOP2__S_ADD_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() + src1.rawData(); - scc = (bits(src0.rawData(), 31) == bits(src1.rawData(), 31) - && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) - ? 
1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_SUB_I32::Inst_SOP2__S_SUB_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_sub_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUB_I32 - - Inst_SOP2__S_SUB_I32::~Inst_SOP2__S_SUB_I32() - { - } // ~Inst_SOP2__S_SUB_I32 - - // D.i = S0.i - S1.i; - // SCC = (S0.u[31] != S1.u[31] && S0.u[31] != D.u[31]) is a signed - // overflow. - void - Inst_SOP2__S_SUB_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() - src1.rawData(); - scc = (bits(src0.rawData(), 31) != bits(src1.rawData(), 31) - && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_ADDC_U32::Inst_SOP2__S_ADDC_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_addc_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADDC_U32 - - Inst_SOP2__S_ADDC_U32::~Inst_SOP2__S_ADDC_U32() - { - } // ~Inst_SOP2__S_ADDC_U32 - - // D.u = S0.u + S1.u + SCC; - // SCC = (S0.u + S1.u + SCC >= 0x100000000ULL ? 1 : 0) is an unsigned - // overflow. - void - Inst_SOP2__S_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = src0.rawData() + src1.rawData() + scc.rawData(); - scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData() - + (ScalarRegU64)scc.rawData()) >= 0x100000000ULL ? 
1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_SUBB_U32::Inst_SOP2__S_SUBB_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_subb_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUBB_U32 - - Inst_SOP2__S_SUBB_U32::~Inst_SOP2__S_SUBB_U32() - { - } // ~Inst_SOP2__S_SUBB_U32 - - // D.u = S0.u - S1.u - SCC; - // SCC = (S1.u + SCC > S0.u ? 1 : 0) is an unsigned overflow. - void - Inst_SOP2__S_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = src0.rawData() - src1.rawData() - scc.rawData(); - scc = (src1.rawData() + scc.rawData()) > src0.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_MIN_I32::Inst_SOP2__S_MIN_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_min_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MIN_I32 - - Inst_SOP2__S_MIN_I32::~Inst_SOP2__S_MIN_I32() - { - } // ~Inst_SOP2__S_MIN_I32 - - // D.i = (S0.i < S1.i) ? S0.i : S1.i; - // SCC = 1 if S0 is chosen as the minimum value. - void - Inst_SOP2__S_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::min(src0.rawData(), src1.rawData()); - scc = (src0.rawData() < src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_MIN_U32::Inst_SOP2__S_MIN_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_min_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MIN_U32 - - Inst_SOP2__S_MIN_U32::~Inst_SOP2__S_MIN_U32() - { - } // ~Inst_SOP2__S_MIN_U32 - - // D.u = (S0.u < S1.u) ? S0.u : S1.u; - // SCC = 1 if S0 is chosen as the minimum value. 
- void - Inst_SOP2__S_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::min(src0.rawData(), src1.rawData()); - scc = (src0.rawData() < src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_MAX_I32::Inst_SOP2__S_MAX_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_max_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MAX_I32 - - Inst_SOP2__S_MAX_I32::~Inst_SOP2__S_MAX_I32() - { - } // ~Inst_SOP2__S_MAX_I32 - - // D.i = (S0.i > S1.i) ? S0.i : S1.i; - // SCC = 1 if S0 is chosen as the maximum value. - void - Inst_SOP2__S_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::max(src0.rawData(), src1.rawData()); - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_MAX_U32::Inst_SOP2__S_MAX_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_max_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MAX_U32 - - Inst_SOP2__S_MAX_U32::~Inst_SOP2__S_MAX_U32() - { - } // ~Inst_SOP2__S_MAX_U32 - - // D.u = (S0.u > S1.u) ? S0.u : S1.u; - // SCC = 1 if S0 is chosen as the maximum value. - void - Inst_SOP2__S_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::max(src0.rawData(), src1.rawData()); - scc = (src0.rawData() > src1.rawData()) ? 
1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_CSELECT_B32::Inst_SOP2__S_CSELECT_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cselect_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_CSELECT_B32 - - Inst_SOP2__S_CSELECT_B32::~Inst_SOP2__S_CSELECT_B32() - { - } // ~Inst_SOP2__S_CSELECT_B32 - - // D.u = SCC ? S0.u : S1.u (conditional select). - void - Inst_SOP2__S_CSELECT_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = scc.rawData() ? src0.rawData() : src1.rawData(); - - sdst.write(); - } - - Inst_SOP2__S_CSELECT_B64::Inst_SOP2__S_CSELECT_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cselect_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_CSELECT_B64 - - Inst_SOP2__S_CSELECT_B64::~Inst_SOP2__S_CSELECT_B64() - { - } // ~Inst_SOP2__S_CSELECT_B64 - - // D.u64 = SCC ? S0.u64 : S1.u64 (conditional select). - void - Inst_SOP2__S_CSELECT_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = scc.rawData() ? src0.rawData() : src1.rawData(); - - sdst.write(); - } - - Inst_SOP2__S_AND_B32::Inst_SOP2__S_AND_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_and_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_AND_B32 - - Inst_SOP2__S_AND_B32::~Inst_SOP2__S_AND_B32() - { - } // ~Inst_SOP2__S_AND_B32 - - // D.u = S0.u & S1.u; - // SCC = 1 if result is non-zero. 
    void
    Inst_SOP2__S_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = src0.rawData() & src1.rawData();
        // SCC reflects the freshly-assigned destination value; sdst must be
        // assigned before this line reads it back via rawData().
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_AND_B64::Inst_SOP2__S_AND_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_and_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_AND_B64

    Inst_SOP2__S_AND_B64::~Inst_SOP2__S_AND_B64()
    {
    } // ~Inst_SOP2__S_AND_B64

    // D.u64 = S0.u64 & S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_AND_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = src0.rawData() & src1.rawData();
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_OR_B32::Inst_SOP2__S_OR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_or_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_OR_B32

    Inst_SOP2__S_OR_B32::~Inst_SOP2__S_OR_B32()
    {
    } // ~Inst_SOP2__S_OR_B32

    // D.u = S0.u | S1.u;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = src0.rawData() | src1.rawData();
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_OR_B64::Inst_SOP2__S_OR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_or_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_OR_B64

    Inst_SOP2__S_OR_B64::~Inst_SOP2__S_OR_B64()
    {
    } // ~Inst_SOP2__S_OR_B64

    // D.u64 = S0.u64 | S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_OR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = src0.rawData() | src1.rawData();
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_XOR_B32::Inst_SOP2__S_XOR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_xor_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_XOR_B32

    Inst_SOP2__S_XOR_B32::~Inst_SOP2__S_XOR_B32()
    {
    } // ~Inst_SOP2__S_XOR_B32

    // D.u = S0.u ^ S1.u;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = src0.rawData() ^ src1.rawData();
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_XOR_B64::Inst_SOP2__S_XOR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_xor_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_XOR_B64

    Inst_SOP2__S_XOR_B64::~Inst_SOP2__S_XOR_B64()
    {
    } // ~Inst_SOP2__S_XOR_B64

    // D.u64 = S0.u64 ^ S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_XOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = src0.rawData() ^ src1.rawData();
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_ANDN2_B32::Inst_SOP2__S_ANDN2_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_andn2_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_ANDN2_B32

    Inst_SOP2__S_ANDN2_B32::~Inst_SOP2__S_ANDN2_B32()
    {
    } // ~Inst_SOP2__S_ANDN2_B32

    // D.u = S0.u & ~S1.u;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_ANDN2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        // "and-not": second operand is bitwise-complemented before the AND.
        sdst = src0.rawData() &~ src1.rawData();
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_ANDN2_B64::Inst_SOP2__S_ANDN2_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_andn2_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_ANDN2_B64

    Inst_SOP2__S_ANDN2_B64::~Inst_SOP2__S_ANDN2_B64()
    {
    } // ~Inst_SOP2__S_ANDN2_B64

    // D.u64 = S0.u64 & ~S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_ANDN2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = src0.rawData() &~ src1.rawData();
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_ORN2_B32::Inst_SOP2__S_ORN2_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_orn2_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_ORN2_B32

    Inst_SOP2__S_ORN2_B32::~Inst_SOP2__S_ORN2_B32()
    {
    } // ~Inst_SOP2__S_ORN2_B32

    // D.u = S0.u | ~S1.u;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_ORN2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = src0.rawData() |~ src1.rawData();
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_ORN2_B64::Inst_SOP2__S_ORN2_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_orn2_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_ORN2_B64

    Inst_SOP2__S_ORN2_B64::~Inst_SOP2__S_ORN2_B64()
    {
    } // ~Inst_SOP2__S_ORN2_B64

    // D.u64 = S0.u64 | ~S1.u64;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_ORN2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = src0.rawData() |~ src1.rawData();
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_NAND_B32::Inst_SOP2__S_NAND_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_nand_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_NAND_B32

    Inst_SOP2__S_NAND_B32::~Inst_SOP2__S_NAND_B32()
    {
    } // ~Inst_SOP2__S_NAND_B32

    // D.u = ~(S0.u & S1.u);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_NAND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = ~(src0.rawData() & src1.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_NAND_B64::Inst_SOP2__S_NAND_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_nand_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_NAND_B64

    Inst_SOP2__S_NAND_B64::~Inst_SOP2__S_NAND_B64()
    {
    } // ~Inst_SOP2__S_NAND_B64

    // D.u64 = ~(S0.u64 & S1.u64);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_NAND_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = ~(src0.rawData() & src1.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_NOR_B32::Inst_SOP2__S_NOR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_nor_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_NOR_B32

    Inst_SOP2__S_NOR_B32::~Inst_SOP2__S_NOR_B32()
    {
    } // ~Inst_SOP2__S_NOR_B32

    // D.u = ~(S0.u | S1.u);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_NOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = ~(src0.rawData() | src1.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_NOR_B64::Inst_SOP2__S_NOR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_nor_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_NOR_B64

    Inst_SOP2__S_NOR_B64::~Inst_SOP2__S_NOR_B64()
    {
    } // ~Inst_SOP2__S_NOR_B64

    // D.u64 = ~(S0.u64 | S1.u64);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_NOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = ~(src0.rawData() | src1.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_XNOR_B32::Inst_SOP2__S_XNOR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_xnor_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_XNOR_B32

    Inst_SOP2__S_XNOR_B32::~Inst_SOP2__S_XNOR_B32()
    {
    } // ~Inst_SOP2__S_XNOR_B32

    // D.u = ~(S0.u ^ S1.u);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_XNOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = ~(src0.rawData() ^ src1.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_XNOR_B64::Inst_SOP2__S_XNOR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_xnor_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_XNOR_B64

    Inst_SOP2__S_XNOR_B64::~Inst_SOP2__S_XNOR_B64()
    {
    } // ~Inst_SOP2__S_XNOR_B64

    // D.u64 = ~(S0.u64 ^ S1.u64);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_XNOR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = ~(src0.rawData() ^ src1.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_LSHL_B32::Inst_SOP2__S_LSHL_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_lshl_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_LSHL_B32

    Inst_SOP2__S_LSHL_B32::~Inst_SOP2__S_LSHL_B32()
    {
    } // ~Inst_SOP2__S_LSHL_B32

    // D.u = S0.u << S1.u[4:0];
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_LSHL_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        // Only the low 5 bits of S1 form the shift amount (max shift 31),
        // so the shift is always in range for a 32-bit operand.
        sdst = (src0.rawData() << bits(src1.rawData(), 4, 0));
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_LSHL_B64::Inst_SOP2__S_LSHL_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_lshl_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_LSHL_B64

    Inst_SOP2__S_LSHL_B64::~Inst_SOP2__S_LSHL_B64()
    {
    } // ~Inst_SOP2__S_LSHL_B64

    // D.u64 = S0.u64 << S1.u[5:0];
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP2__S_LSHL_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        // Low 6 bits of S1 (max shift 63) for the 64-bit operand.
        sdst = (src0.rawData() << bits(src1.rawData(), 5, 0));
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_LSHR_B32::Inst_SOP2__S_LSHR_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_lshr_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_LSHR_B32

    Inst_SOP2__S_LSHR_B32::~Inst_SOP2__S_LSHR_B32()
    {
    } // ~Inst_SOP2__S_LSHR_B32

    // D.u = S0.u >> S1.u[4:0];
    // SCC = 1 if result is non-zero.
    // The vacated bits are set to zero.
    void
    Inst_SOP2__S_LSHR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        // Unsigned operand type gives the required logical (zero-fill) shift.
        sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0));
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_LSHR_B64::Inst_SOP2__S_LSHR_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_lshr_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_LSHR_B64

    Inst_SOP2__S_LSHR_B64::~Inst_SOP2__S_LSHR_B64()
    {
    } // ~Inst_SOP2__S_LSHR_B64

    // D.u64 = S0.u64 >> S1.u[5:0];
    // SCC = 1 if result is non-zero.
    // The vacated bits are set to zero.
    void
    Inst_SOP2__S_LSHR_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0));
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_ASHR_I32::Inst_SOP2__S_ASHR_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_ashr_i32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_ASHR_I32

    Inst_SOP2__S_ASHR_I32::~Inst_SOP2__S_ASHR_I32()
    {
    } // ~Inst_SOP2__S_ASHR_I32

    // D.i = signext(S0.i) >> S1.u[4:0];
    // SCC = 1 if result is non-zero.
    // The vacated bits are set to the sign bit of the input value.
    void
    Inst_SOP2__S_ASHR_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        // Signed operand type: '>>' performs the arithmetic (sign-filling)
        // shift the instruction requires on the compilers gem5 supports.
        sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0));
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_ASHR_I64::Inst_SOP2__S_ASHR_I64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_ashr_i64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_ASHR_I64

    Inst_SOP2__S_ASHR_I64::~Inst_SOP2__S_ASHR_I64()
    {
    } // ~Inst_SOP2__S_ASHR_I64

    // D.i64 = signext(S0.i64) >> S1.u[5:0];
    // SCC = 1 if result is non-zero.
    // The vacated bits are set to the sign bit of the input value.
    void
    Inst_SOP2__S_ASHR_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandI64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0));
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    }

    Inst_SOP2__S_BFM_B32::Inst_SOP2__S_BFM_B32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_bfm_b32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_BFM_B32

    Inst_SOP2__S_BFM_B32::~Inst_SOP2__S_BFM_B32()
    {
    } // ~Inst_SOP2__S_BFM_B32

    // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0] (bitfield mask).
    // Does not write SCC.
    void
    Inst_SOP2__S_BFM_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);

        src0.read();
        src1.read();

        // Width and offset are both clamped to 5 bits, so the shifts stay
        // within the 32-bit operand.
        sdst = ((1 << bits(src0.rawData(), 4, 0)) - 1)
            << bits(src1.rawData(), 4, 0);

        sdst.write();
    }

    Inst_SOP2__S_BFM_B64::Inst_SOP2__S_BFM_B64(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_bfm_b64")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_BFM_B64

    Inst_SOP2__S_BFM_B64::~Inst_SOP2__S_BFM_B64()
    {
    } // ~Inst_SOP2__S_BFM_B64

    // D.u64 = ((1ULL << S0.u[5:0]) - 1) << S1.u[5:0] (bitfield mask).
    // Does not write SCC.
    void
    Inst_SOP2__S_BFM_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

        src0.read();
        src1.read();

        // 1ULL keeps the mask computation in 64 bits (fields are 6-bit).
        sdst = ((1ULL << bits(src0.rawData(), 5, 0)) - 1)
            << bits(src1.rawData(), 5, 0);

        sdst.write();
    }

    Inst_SOP2__S_MUL_I32::Inst_SOP2__S_MUL_I32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_mul_i32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_MUL_I32

    Inst_SOP2__S_MUL_I32::~Inst_SOP2__S_MUL_I32()
    {
    } // ~Inst_SOP2__S_MUL_I32

    // D.i = S0.i * S1.i (low 32 bits of the product; does not write SCC).
    void
    Inst_SOP2__S_MUL_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

        src0.read();
        src1.read();

        sdst = src0.rawData() * src1.rawData();

        sdst.write();
    }

    Inst_SOP2__S_BFE_U32::Inst_SOP2__S_BFE_U32(InFmt_SOP2 *iFmt)
        : Inst_SOP2(iFmt, "s_bfe_u32")
    {
        setFlag(ALU);
    } // Inst_SOP2__S_BFE_U32

    Inst_SOP2__S_BFE_U32::~Inst_SOP2__S_BFE_U32()
    {
    } // ~Inst_SOP2__S_BFE_U32

    // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is
    // field width.
- // D.u = (S0.u >> S1.u[4:0]) & ((1 << S1.u[22:16]) - 1); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_BFE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_BFE_I32::Inst_SOP2__S_BFE_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_I32 - - Inst_SOP2__S_BFE_I32::~Inst_SOP2__S_BFE_I32() - { - } // ~Inst_SOP2__S_BFE_I32 - - // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is - // field width. - // D.i = (S0.i >> S1.u[4:0]) & ((1 << S1.u[22:16]) - 1); - // Sign-extend the result; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_BFE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_BFE_U64::Inst_SOP2__S_BFE_U64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_u64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_U64 - - Inst_SOP2__S_BFE_U64::~Inst_SOP2__S_BFE_U64() - { - } // ~Inst_SOP2__S_BFE_U64 - - // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is - // field width. - // D.u64 = (S0.u64 >> S1.u[5:0]) & ((1 << S1.u[22:16]) - 1); - // SCC = 1 if result is non-zero. 
- void - Inst_SOP2__S_BFE_U64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_BFE_I64::Inst_SOP2__S_BFE_I64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_i64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_I64 - - Inst_SOP2__S_BFE_I64::~Inst_SOP2__S_BFE_I64() - { - } // ~Inst_SOP2__S_BFE_I64 - - // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is - // field width. - // D.i64 = (S0.i64 >> S1.u[5:0]) & ((1 << S1.u[22:16]) - 1); - // Sign-extend result; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_BFE_I64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_CBRANCH_G_FORK::Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cbranch_g_fork") - { - setFlag(Branch); - } // Inst_SOP2__S_CBRANCH_G_FORK - - Inst_SOP2__S_CBRANCH_G_FORK::~Inst_SOP2__S_CBRANCH_G_FORK() - { - } // ~Inst_SOP2__S_CBRANCH_G_FORK - - // Conditional branch using branch-stack. - // S0 = compare mask(vcc or any sgpr) and - // S1 = 64-bit byte address of target instruction. 
- void - Inst_SOP2__S_CBRANCH_G_FORK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOP2__S_ABSDIFF_I32::Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_absdiff_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ABSDIFF_I32 - - Inst_SOP2__S_ABSDIFF_I32::~Inst_SOP2__S_ABSDIFF_I32() - { - } // ~Inst_SOP2__S_ABSDIFF_I32 - - // D.i = S0.i - S1.i; - // if (D.i < 0) then D.i = -D.i; - // SCC = 1 if result is non-zero. - // Compute the absolute value of difference between two values. - void - Inst_SOP2__S_ABSDIFF_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - sdst = std::abs(src0.rawData() - src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP2__S_RFE_RESTORE_B64::Inst_SOP2__S_RFE_RESTORE_B64( - InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_rfe_restore_b64") - { - } // Inst_SOP2__S_RFE_RESTORE_B64 - - Inst_SOP2__S_RFE_RESTORE_B64::~Inst_SOP2__S_RFE_RESTORE_B64() - { - } // ~Inst_SOP2__S_RFE_RESTORE_B64 - - // Return from exception handler and continue. - void - Inst_SOP2__S_RFE_RESTORE_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_movk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_MOVK_I32 - - Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32() - { - } // ~Inst_SOPK__S_MOVK_I32 - - // D.i = signext(SIMM16) (sign extension). 
- void - Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - sdst = simm16; - - sdst.write(); - } - - Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmovk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMOVK_I32 - - Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32() - { - } // ~Inst_SOPK__S_CMOVK_I32 - - // if (SCC) then D.i = signext(SIMM16); - // else NOP. - // Conditional move with sign extension. - void - Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (scc.rawData()) { - sdst = simm16; - sdst.write(); - } - } - - Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_eq_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_EQ_I32 - - Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32() - { - } // ~Inst_SOPK__S_CMPK_EQ_I32 - - // SCC = (S0.i == signext(SIMM16)). - void - Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() == simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lg_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LG_I32 - - Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32() - { - } // ~Inst_SOPK__S_CMPK_LG_I32 - - // SCC = (S0.i != signext(SIMM16)). 
- void - Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() != simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_gt_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GT_I32 - - Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32() - { - } // ~Inst_SOPK__S_CMPK_GT_I32 - - // SCC = (S0.i > signext(SIMM16)). - void - Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() > simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_ge_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GE_I32 - - Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32() - { - } // ~Inst_SOPK__S_CMPK_GE_I32 - - // SCC = (S0.i >= signext(SIMM16)). - void - Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() >= simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lt_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LT_I32 - - Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32() - { - } // ~Inst_SOPK__S_CMPK_LT_I32 - - // SCC = (S0.i < signext(SIMM16)). 
- void - Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() < simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_le_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LE_I32 - - Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32() - { - } // ~Inst_SOPK__S_CMPK_LE_I32 - - // SCC = (S0.i <= signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() <= simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_eq_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_EQ_U32 - - Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32() - { - } // ~Inst_SOPK__S_CMPK_EQ_U32 - - // SCC = (S0.u == SIMM16). - void - Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() == simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lg_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LG_U32 - - Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32() - { - } // ~Inst_SOPK__S_CMPK_LG_U32 - - // SCC = (S0.u != SIMM16). 
- void - Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() != simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_gt_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GT_U32 - - Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32() - { - } // ~Inst_SOPK__S_CMPK_GT_U32 - - // SCC = (S0.u > SIMM16). - void - Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() > simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_ge_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GE_U32 - - Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32() - { - } // ~Inst_SOPK__S_CMPK_GE_U32 - - // SCC = (S0.u >= SIMM16). - void - Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() >= simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lt_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LT_U32 - - Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32() - { - } // ~Inst_SOPK__S_CMPK_LT_U32 - - // SCC = (S0.u < SIMM16). 
- void - Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() < simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_le_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LE_U32 - - Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32() - { - } // ~Inst_SOPK__S_CMPK_LE_U32 - - // SCC = (S0.u <= SIMM16). - void - Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() <= simm16) ? 1 : 0; - - scc.write(); - } - - Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_addk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_ADDK_I32 - - Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32() - { - } // ~Inst_SOPK__S_ADDK_I32 - - // D.i = D.i + signext(SIMM16); - // SCC = overflow. - void - Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = src.rawData() + (ScalarRegI32)sext<16>(simm16); - scc = (bits(src.rawData(), 31) == bits(simm16, 15) - && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_mulk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_MULK_I32 - - Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32() - { - } // ~Inst_SOPK__S_MULK_I32 - - // D.i = D.i * signext(SIMM16). 
void
Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst)
{
    ScalarRegI16 simm16 = instData.SIMM16;
    // Read-modify-write of the SDST register; no SCC update for s_mulk.
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

    sdst.read();

    sdst = sdst.rawData() * (ScalarRegI32)sext<16>(simm16);

    sdst.write();
}

Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt)
    : Inst_SOPK(iFmt, "s_cbranch_i_fork")
{
    setFlag(Branch);
} // Inst_SOPK__S_CBRANCH_I_FORK

Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK()
{
} // ~Inst_SOPK__S_CBRANCH_I_FORK

// Conditional branch using branch-stack.
// S0 = compare mask(vcc or any sgpr), and
// SIMM16 = signed DWORD branch offset relative to next instruction.
// Not implemented in this model.
void
Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
}

Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt)
    : Inst_SOPK(iFmt, "s_getreg_b32")
{
} // Inst_SOPK__S_GETREG_B32

Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32()
{
} // ~Inst_SOPK__S_GETREG_B32

// D.u = hardware-reg. Read some or all of a hardware register into the
// LSBs of D.
// SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
// is 1..32.
// Not implemented in this model.
void
Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
}

Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt)
    : Inst_SOPK(iFmt, "s_setreg_b32")
{
    setFlag(ALU);
} // Inst_SOPK__S_SETREG_B32

Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32()
{
} // ~Inst_SOPK__S_SETREG_B32

// hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware
// register.
// SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
// is 1..32.
void
Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst)
{
    // Unpack SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}.
    ScalarRegI16 simm16 = instData.SIMM16;
    ScalarRegU32 hwregId = simm16 & 0x3f;
    ScalarRegU32 offset = (simm16 >> 6) & 31;
    ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;

    // NOTE(review): the hardware-register id is used directly as a scalar
    // operand index here — presumably the model aliases hw regs onto the
    // scalar operand space; confirm against the operand mapping.
    ScalarOperandU32 hwreg(gpuDynInst, hwregId);
    ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
    hwreg.read();
    sdst.read();

    // Store value from SDST to part of the hardware register.
    ScalarRegU32 mask = (((1U << size) - 1U) << offset);
    hwreg = ((hwreg.rawData() & ~mask)
                    | ((sdst.rawData() << offset) & mask));
    hwreg.write();

    // set MODE register to control the behavior of single precision
    // floating-point numbers: denormal mode or round mode
    if (hwregId==1 && size==2
        && (offset==4 || offset==0)) {
        warn_once("Be cautious that s_setreg_b32 has no real effect "
                        "on FP modes: %s\n", gpuDynInst->disassemble());
        return;
    }

    // panic if not changing MODE of floating-point numbers
    panicUnimplemented();
}

Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32(
        InFmt_SOPK *iFmt)
    : Inst_SOPK(iFmt, "s_setreg_imm32_b32")
{
    setFlag(ALU);
} // Inst_SOPK__S_SETREG_IMM32_B32

Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32()
{
} // ~Inst_SOPK__S_SETREG_IMM32_B32

// Write some or all of the LSBs of IMM32 into a hardware register; this
// instruction requires a 32-bit literal constant.
// SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size
// is 1..32.
void
Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst)
{
    // Unpack SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}.
    ScalarRegI16 simm16 = instData.SIMM16;
    ScalarRegU32 hwregId = simm16 & 0x3f;
    ScalarRegU32 offset = (simm16 >> 6) & 31;
    ScalarRegU32 size = ((simm16 >> 11) & 31) + 1;

    ScalarOperandU32 hwreg(gpuDynInst, hwregId);
    // The source comes from the 32-bit literal that follows the
    // instruction word, not from a register.
    ScalarRegU32 simm32 = extData.imm_u32;
    hwreg.read();

    // Merge the literal's low `size` bits into the register at `offset`.
    ScalarRegU32 mask = (((1U << size) - 1U) << offset);
    hwreg = ((hwreg.rawData() & ~mask)
                    | ((simm32 << offset) & mask));
    hwreg.write();

    // Writes that target the FP MODE bits are tolerated with a warning;
    // anything else is unimplemented.
    if (hwregId==1 && size==2
        && (offset==4 || offset==0)) {
        warn_once("Be cautious that s_setreg_imm32_b32 has no real effect "
                        "on FP modes: %s\n", gpuDynInst->disassemble());
        return;
    }

    // panic if not changing MODE of floating-point numbers
    panicUnimplemented();
}

Inst_SOP1__S_MOV_B32::Inst_SOP1__S_MOV_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_mov_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_MOV_B32

Inst_SOP1__S_MOV_B32::~Inst_SOP1__S_MOV_B32()
{
} // ~Inst_SOP1__S_MOV_B32

// D.u = S0.u.
void
Inst_SOP1__S_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU32 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = src.rawData();

    sdst.write();
}

Inst_SOP1__S_MOV_B64::Inst_SOP1__S_MOV_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_mov_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_MOV_B64

Inst_SOP1__S_MOV_B64::~Inst_SOP1__S_MOV_B64()
{
} // ~Inst_SOP1__S_MOV_B64

// D.u64 = S0.u64.
void
Inst_SOP1__S_MOV_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = src.rawData();

    sdst.write();
}

Inst_SOP1__S_CMOV_B32::Inst_SOP1__S_CMOV_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_cmov_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_CMOV_B32

Inst_SOP1__S_CMOV_B32::~Inst_SOP1__S_CMOV_B32()
{
} // ~Inst_SOP1__S_CMOV_B32

// if (SCC) then D.u = S0.u;
// else NOP.
// Conditional move.
void
Inst_SOP1__S_CMOV_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();
    scc.read();

    // Destination is only written when SCC is set.
    if (scc.rawData()) {
        sdst = src.rawData();
        sdst.write();
    }
}

Inst_SOP1__S_CMOV_B64::Inst_SOP1__S_CMOV_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_cmov_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_CMOV_B64

Inst_SOP1__S_CMOV_B64::~Inst_SOP1__S_CMOV_B64()
{
} // ~Inst_SOP1__S_CMOV_B64

// if (SCC) then D.u64 = S0.u64;
// else NOP.
// Conditional move.
void
Inst_SOP1__S_CMOV_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();
    scc.read();

    if (scc.rawData()) {
        sdst = src.rawData();
        sdst.write();
    }
}

Inst_SOP1__S_NOT_B32::Inst_SOP1__S_NOT_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_not_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_NOT_B32

Inst_SOP1__S_NOT_B32::~Inst_SOP1__S_NOT_B32()
{
} // ~Inst_SOP1__S_NOT_B32

// D.u = ~S0.u;
// SCC = 1 if result is non-zero.
// Bitwise negation.
void
Inst_SOP1__S_NOT_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = ~src.rawData();

    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_NOT_B64::Inst_SOP1__S_NOT_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_not_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_NOT_B64

Inst_SOP1__S_NOT_B64::~Inst_SOP1__S_NOT_B64()
{
} // ~Inst_SOP1__S_NOT_B64

// D.u64 = ~S0.u64;
// SCC = 1 if result is non-zero.
// Bitwise negation.
void
Inst_SOP1__S_NOT_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = ~src.rawData();
    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_WQM_B32::Inst_SOP1__S_WQM_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_wqm_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_WQM_B32

Inst_SOP1__S_WQM_B32::~Inst_SOP1__S_WQM_B32()
{
} // ~Inst_SOP1__S_WQM_B32

// Computes whole quad mode for an active/valid mask.
// SCC = 1 if result is non-zero.
void
Inst_SOP1__S_WQM_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    // wholeQuadMode() is the shared helper implementing WQM semantics.
    sdst = wholeQuadMode(src.rawData());
    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_WQM_B64::Inst_SOP1__S_WQM_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_wqm_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_WQM_B64

Inst_SOP1__S_WQM_B64::~Inst_SOP1__S_WQM_B64()
{
} // ~Inst_SOP1__S_WQM_B64

// Computes whole quad mode for an active/valid mask.
// SCC = 1 if result is non-zero.
void
Inst_SOP1__S_WQM_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = wholeQuadMode(src.rawData());
    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_BREV_B32::Inst_SOP1__S_BREV_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_brev_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_BREV_B32

Inst_SOP1__S_BREV_B32::~Inst_SOP1__S_BREV_B32()
{
} // ~Inst_SOP1__S_BREV_B32

// D.u[31:0] = S0.u[0:31] (reverse bits).
// Note: s_brev does not modify SCC.
void
Inst_SOP1__S_BREV_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU32 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = reverseBits(src.rawData());

    sdst.write();
}

Inst_SOP1__S_BREV_B64::Inst_SOP1__S_BREV_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_brev_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_BREV_B64

Inst_SOP1__S_BREV_B64::~Inst_SOP1__S_BREV_B64()
{
} // ~Inst_SOP1__S_BREV_B64

// D.u64[63:0] = S0.u64[0:63] (reverse bits).
void
Inst_SOP1__S_BREV_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = reverseBits(src.rawData());

    sdst.write();
}

Inst_SOP1__S_BCNT0_I32_B32::Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_bcnt0_i32_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_BCNT0_I32_B32

Inst_SOP1__S_BCNT0_I32_B32::~Inst_SOP1__S_BCNT0_I32_B32()
{
} // ~Inst_SOP1__S_BCNT0_I32_B32

// D.i = CountZeroBits(S0.u);
// SCC = 1 if result is non-zero.
void
Inst_SOP1__S_BCNT0_I32_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = countZeroBits(src.rawData());
    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_BCNT0_I32_B64::Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_bcnt0_i32_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_BCNT0_I32_B64

Inst_SOP1__S_BCNT0_I32_B64::~Inst_SOP1__S_BCNT0_I32_B64()
{
} // ~Inst_SOP1__S_BCNT0_I32_B64

// D.i = CountZeroBits(S0.u64);
// SCC = 1 if result is non-zero.
void
Inst_SOP1__S_BCNT0_I32_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = countZeroBits(src.rawData());
    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_BCNT1_I32_B32::Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_bcnt1_i32_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_BCNT1_I32_B32

Inst_SOP1__S_BCNT1_I32_B32::~Inst_SOP1__S_BCNT1_I32_B32()
{
} // ~Inst_SOP1__S_BCNT1_I32_B32

// D.i = CountOneBits(S0.u);
// SCC = 1 if result is non-zero.
void
Inst_SOP1__S_BCNT1_I32_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = popCount(src.rawData());
    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_BCNT1_I32_B64::Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_bcnt1_i32_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_BCNT1_I32_B64

Inst_SOP1__S_BCNT1_I32_B64::~Inst_SOP1__S_BCNT1_I32_B64()
{
} // ~Inst_SOP1__S_BCNT1_I32_B64

// D.i = CountOneBits(S0.u64);
// SCC = 1 if result is non-zero.
void
Inst_SOP1__S_BCNT1_I32_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = popCount(src.rawData());
    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_FF0_I32_B32::Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_ff0_i32_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_FF0_I32_B32

Inst_SOP1__S_FF0_I32_B32::~Inst_SOP1__S_FF0_I32_B32()
{
} // ~Inst_SOP1__S_FF0_I32_B32

// D.i = FindFirstZero(S0.u);
// If no zeros are found, return -1.
// Returns the bit position of the first zero from the LSB.
void
Inst_SOP1__S_FF0_I32_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = findFirstZero(src.rawData());

    sdst.write();
}

Inst_SOP1__S_FF0_I32_B64::Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_ff0_i32_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_FF0_I32_B64

Inst_SOP1__S_FF0_I32_B64::~Inst_SOP1__S_FF0_I32_B64()
{
} // ~Inst_SOP1__S_FF0_I32_B64

// D.i = FindFirstZero(S0.u64);
// If no zeros are found, return -1.
// Returns the bit position of the first zero from the LSB.
void
Inst_SOP1__S_FF0_I32_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = findFirstZero(src.rawData());

    sdst.write();
}

Inst_SOP1__S_FF1_I32_B32::Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_ff1_i32_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_FF1_I32_B32

Inst_SOP1__S_FF1_I32_B32::~Inst_SOP1__S_FF1_I32_B32()
{
} // ~Inst_SOP1__S_FF1_I32_B32

// D.i = FindFirstOne(S0.u);
// If no ones are found, return -1.
// Returns the bit position of the first one from the LSB.
void
Inst_SOP1__S_FF1_I32_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = findFirstOne(src.rawData());

    sdst.write();
}

Inst_SOP1__S_FF1_I32_B64::Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_ff1_i32_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_FF1_I32_B64

Inst_SOP1__S_FF1_I32_B64::~Inst_SOP1__S_FF1_I32_B64()
{
} // ~Inst_SOP1__S_FF1_I32_B64

// D.i = FindFirstOne(S0.u64);
// If no ones are found, return -1.
// Returns the bit position of the first one from the LSB.
void
Inst_SOP1__S_FF1_I32_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = findFirstOne(src.rawData());

    sdst.write();
}

Inst_SOP1__S_FLBIT_I32_B32::Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_flbit_i32_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_FLBIT_I32_B32

Inst_SOP1__S_FLBIT_I32_B32::~Inst_SOP1__S_FLBIT_I32_B32()
{
} // ~Inst_SOP1__S_FLBIT_I32_B32

// D.i = FindFirstOne(S0.u);
// If no ones are found, return -1.
// Counts how many zeros before the first one starting from the MSB.
void
Inst_SOP1__S_FLBIT_I32_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = countZeroBitsMsb(src.rawData());

    sdst.write();
}

Inst_SOP1__S_FLBIT_I32_B64::Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_flbit_i32_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_FLBIT_I32_B64

Inst_SOP1__S_FLBIT_I32_B64::~Inst_SOP1__S_FLBIT_I32_B64()
{
} // ~Inst_SOP1__S_FLBIT_I32_B64

// D.i = FindFirstOne(S0.u64);
// If no ones are found, return -1.
// Counts how many zeros before the first one starting from the MSB.
void
Inst_SOP1__S_FLBIT_I32_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst = countZeroBitsMsb(src.rawData());

    sdst.write();
}

Inst_SOP1__S_FLBIT_I32::Inst_SOP1__S_FLBIT_I32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_flbit_i32")
{
    setFlag(ALU);
} // Inst_SOP1__S_FLBIT_I32

Inst_SOP1__S_FLBIT_I32::~Inst_SOP1__S_FLBIT_I32()
{
} // ~Inst_SOP1__S_FLBIT_I32

// D.i = FirstOppositeSignBit(S0.i);
// If S0.i == 0 or S0.i == -1 (all bits are the same), return -1.
// Counts how many bits in a row (from MSB to LSB) are the same as the
// sign bit.
- void - Inst_SOP1__S_FLBIT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = firstOppositeSignBit(src.rawData()); - - sdst.write(); - } - - Inst_SOP1__S_FLBIT_I32_I64::Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_i64") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_I64 - - Inst_SOP1__S_FLBIT_I32_I64::~Inst_SOP1__S_FLBIT_I32_I64() - { - } // ~Inst_SOP1__S_FLBIT_I32_I64 - - // D.i = FirstOppositeSignBit(S0.i64); - // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. - // Counts how many bits in a row (from MSB to LSB) are the same as the - // sign bit. - void - Inst_SOP1__S_FLBIT_I32_I64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = firstOppositeSignBit(src.rawData()); - - sdst.write(); - } - - Inst_SOP1__S_SEXT_I32_I8::Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_sext_i32_i8") - { - setFlag(ALU); - } // Inst_SOP1__S_SEXT_I32_I8 - - Inst_SOP1__S_SEXT_I32_I8::~Inst_SOP1__S_SEXT_I32_I8() - { - } // ~Inst_SOP1__S_SEXT_I32_I8 - - // D.i = signext(S0.i[7:0]) (sign extension). - void - Inst_SOP1__S_SEXT_I32_I8::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = sext::digits>( - bits(src.rawData(), 7, 0)); - - sdst.write(); - } - - Inst_SOP1__S_SEXT_I32_I16::Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_sext_i32_i16") - { - setFlag(ALU); - } // Inst_SOP1__S_SEXT_I32_I16 - - Inst_SOP1__S_SEXT_I32_I16::~Inst_SOP1__S_SEXT_I32_I16() - { - } // ~Inst_SOP1__S_SEXT_I32_I16 - - // D.i = signext(S0.i[15:0]) (sign extension). 
- void - Inst_SOP1__S_SEXT_I32_I16::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = sext::digits>( - bits(src.rawData(), 15, 0)); - - sdst.write(); - } - - Inst_SOP1__S_BITSET0_B32::Inst_SOP1__S_BITSET0_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset0_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET0_B32 - - Inst_SOP1__S_BITSET0_B32::~Inst_SOP1__S_BITSET0_B32() - { - } // ~Inst_SOP1__S_BITSET0_B32 - - // D.u[S0.u[4:0]] = 0. - void - Inst_SOP1__S_BITSET0_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 4, 0), 0); - - sdst.write(); - } - - Inst_SOP1__S_BITSET0_B64::Inst_SOP1__S_BITSET0_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset0_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET0_B64 - - Inst_SOP1__S_BITSET0_B64::~Inst_SOP1__S_BITSET0_B64() - { - } // ~Inst_SOP1__S_BITSET0_B64 - - // D.u64[S0.u[5:0]] = 0. - void - Inst_SOP1__S_BITSET0_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 5, 0), 0); - - sdst.write(); - } - - Inst_SOP1__S_BITSET1_B32::Inst_SOP1__S_BITSET1_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset1_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET1_B32 - - Inst_SOP1__S_BITSET1_B32::~Inst_SOP1__S_BITSET1_B32() - { - } // ~Inst_SOP1__S_BITSET1_B32 - - // D.u[S0.u[4:0]] = 1. 
void
Inst_SOP1__S_BITSET1_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU32 sdst(gpuDynInst, instData.SDST);

    src.read();

    // Set the bit of SDST selected by the low 5 bits of the source.
    sdst.setBit(bits(src.rawData(), 4, 0), 1);

    sdst.write();
}

Inst_SOP1__S_BITSET1_B64::Inst_SOP1__S_BITSET1_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_bitset1_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_BITSET1_B64

Inst_SOP1__S_BITSET1_B64::~Inst_SOP1__S_BITSET1_B64()
{
} // ~Inst_SOP1__S_BITSET1_B64

// D.u64[S0.u[5:0]] = 1.
void
Inst_SOP1__S_BITSET1_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

    src.read();

    sdst.setBit(bits(src.rawData(), 5, 0), 1);

    sdst.write();
}

Inst_SOP1__S_GETPC_B64::Inst_SOP1__S_GETPC_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_getpc_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_GETPC_B64

Inst_SOP1__S_GETPC_B64::~Inst_SOP1__S_GETPC_B64()
{
} // ~Inst_SOP1__S_GETPC_B64

// D.u64 = PC + 4.
// Destination receives the byte address of the next instruction.
void
Inst_SOP1__S_GETPC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    Addr pc = wf->pc();
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

    // +4 skips over this (4-byte) instruction word.
    sdst = pc + 4;

    sdst.write();
}

Inst_SOP1__S_SETPC_B64::Inst_SOP1__S_SETPC_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_setpc_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_SETPC_B64

Inst_SOP1__S_SETPC_B64::~Inst_SOP1__S_SETPC_B64()
{
} // ~Inst_SOP1__S_SETPC_B64

// PC = S0.u64.
// S0.u64 is a byte address of the instruction to jump to.
void
Inst_SOP1__S_SETPC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);

    src.read();

    // Unconditional indirect jump: the wavefront PC becomes the
    // byte address held in S0.
    wf->pc(src.rawData());
}

Inst_SOP1__S_SWAPPC_B64::Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_swappc_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_SWAPPC_B64

Inst_SOP1__S_SWAPPC_B64::~Inst_SOP1__S_SWAPPC_B64()
{
} // ~Inst_SOP1__S_SWAPPC_B64

// D.u64 = PC + 4; PC = S0.u64.
// S0.u64 is a byte address of the instruction to jump to.
void
Inst_SOP1__S_SWAPPC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    Addr pc = wf->pc();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

    src.read();

    // Save the return address before redirecting the PC (call idiom).
    sdst = pc + 4;

    wf->pc(src.rawData());
    sdst.write();
}

Inst_SOP1__S_RFE_B64::Inst_SOP1__S_RFE_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_rfe_b64")
{
} // Inst_SOP1__S_RFE_B64

Inst_SOP1__S_RFE_B64::~Inst_SOP1__S_RFE_B64()
{
} // ~Inst_SOP1__S_RFE_B64

// Return from exception handler and continue.
// Not implemented in this model.
void
Inst_SOP1__S_RFE_B64::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
}

Inst_SOP1__S_AND_SAVEEXEC_B64::Inst_SOP1__S_AND_SAVEEXEC_B64(
    InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_and_saveexec_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_AND_SAVEEXEC_B64

Inst_SOP1__S_AND_SAVEEXEC_B64::~Inst_SOP1__S_AND_SAVEEXEC_B64()
{
} // ~Inst_SOP1__S_AND_SAVEEXEC_B64

// D.u64 = EXEC;
// EXEC = S0.u64 & EXEC;
// SCC = 1 if the new value of EXEC is non-zero.
void
Inst_SOP1__S_AND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    // Save the old EXEC mask before modifying it.
    sdst = wf->execMask().to_ullong();
    wf->execMask() = src.rawData() & wf->execMask().to_ullong();
    scc = wf->execMask().any() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_OR_SAVEEXEC_B64::Inst_SOP1__S_OR_SAVEEXEC_B64(
    InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_or_saveexec_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_OR_SAVEEXEC_B64

Inst_SOP1__S_OR_SAVEEXEC_B64::~Inst_SOP1__S_OR_SAVEEXEC_B64()
{
} // ~Inst_SOP1__S_OR_SAVEEXEC_B64

// D.u64 = EXEC;
// EXEC = S0.u64 | EXEC;
// SCC = 1 if the new value of EXEC is non-zero.
void
Inst_SOP1__S_OR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = wf->execMask().to_ullong();
    wf->execMask() = src.rawData() | wf->execMask().to_ullong();
    scc = wf->execMask().any() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_XOR_SAVEEXEC_B64::Inst_SOP1__S_XOR_SAVEEXEC_B64(
    InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_xor_saveexec_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_XOR_SAVEEXEC_B64

Inst_SOP1__S_XOR_SAVEEXEC_B64::~Inst_SOP1__S_XOR_SAVEEXEC_B64()
{
} // ~Inst_SOP1__S_XOR_SAVEEXEC_B64

// D.u64 = EXEC;
// EXEC = S0.u64 ^ EXEC;
// SCC = 1 if the new value of EXEC is non-zero.
void
Inst_SOP1__S_XOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = wf->execMask().to_ullong();
    wf->execMask() = src.rawData() ^ wf->execMask().to_ullong();
    scc = wf->execMask().any() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_ANDN2_SAVEEXEC_B64::Inst_SOP1__S_ANDN2_SAVEEXEC_B64(
    InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_andn2_saveexec_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_ANDN2_SAVEEXEC_B64

Inst_SOP1__S_ANDN2_SAVEEXEC_B64::~Inst_SOP1__S_ANDN2_SAVEEXEC_B64()
{
} // ~Inst_SOP1__S_ANDN2_SAVEEXEC_B64

// D.u64 = EXEC;
// EXEC = S0.u64 & ~EXEC;
// SCC = 1 if the new value of EXEC is non-zero.
void
Inst_SOP1__S_ANDN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = wf->execMask().to_ullong();
    // "&~" negates the *old* EXEC, per the andn2 semantics above.
    wf->execMask() = src.rawData() &~ wf->execMask().to_ullong();
    scc = wf->execMask().any() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_ORN2_SAVEEXEC_B64::Inst_SOP1__S_ORN2_SAVEEXEC_B64(
    InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_orn2_saveexec_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_ORN2_SAVEEXEC_B64

Inst_SOP1__S_ORN2_SAVEEXEC_B64::~Inst_SOP1__S_ORN2_SAVEEXEC_B64()
{
} // ~Inst_SOP1__S_ORN2_SAVEEXEC_B64

// D.u64 = EXEC;
// EXEC = S0.u64 | ~EXEC;
// SCC = 1 if the new value of EXEC is non-zero.
void
Inst_SOP1__S_ORN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = wf->execMask().to_ullong();
    // "|~" negates the *old* EXEC, per the orn2 semantics above.
    wf->execMask() = src.rawData() |~ wf->execMask().to_ullong();
    scc = wf->execMask().any() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_NAND_SAVEEXEC_B64::Inst_SOP1__S_NAND_SAVEEXEC_B64(
    InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_nand_saveexec_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_NAND_SAVEEXEC_B64

Inst_SOP1__S_NAND_SAVEEXEC_B64::~Inst_SOP1__S_NAND_SAVEEXEC_B64()
{
} // ~Inst_SOP1__S_NAND_SAVEEXEC_B64

// D.u64 = EXEC;
// EXEC = ~(S0.u64 & EXEC);
// SCC = 1 if the new value of EXEC is non-zero.
void
Inst_SOP1__S_NAND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = wf->execMask().to_ullong();
    wf->execMask() = ~(src.rawData() & wf->execMask().to_ullong());
    scc = wf->execMask().any() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_NOR_SAVEEXEC_B64::Inst_SOP1__S_NOR_SAVEEXEC_B64(
    InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_nor_saveexec_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_NOR_SAVEEXEC_B64

Inst_SOP1__S_NOR_SAVEEXEC_B64::~Inst_SOP1__S_NOR_SAVEEXEC_B64()
{
} // ~Inst_SOP1__S_NOR_SAVEEXEC_B64

// D.u64 = EXEC;
// EXEC = ~(S0.u64 | EXEC);
// SCC = 1 if the new value of EXEC is non-zero.
void
Inst_SOP1__S_NOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = wf->execMask().to_ullong();
    wf->execMask() = ~(src.rawData() | wf->execMask().to_ullong());
    scc = wf->execMask().any() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_XNOR_SAVEEXEC_B64::Inst_SOP1__S_XNOR_SAVEEXEC_B64(
    InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_xnor_saveexec_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_XNOR_SAVEEXEC_B64

Inst_SOP1__S_XNOR_SAVEEXEC_B64::~Inst_SOP1__S_XNOR_SAVEEXEC_B64()
{
} // ~Inst_SOP1__S_XNOR_SAVEEXEC_B64

// D.u64 = EXEC;
// EXEC = ~(S0.u64 ^ EXEC);
// SCC = 1 if the new value of EXEC is non-zero.
void
Inst_SOP1__S_XNOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = wf->execMask().to_ullong();
    wf->execMask() = ~(src.rawData() ^ wf->execMask().to_ullong());
    scc = wf->execMask().any() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_QUADMASK_B32::Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_quadmask_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_QUADMASK_B32

Inst_SOP1__S_QUADMASK_B32::~Inst_SOP1__S_QUADMASK_B32()
{
} // ~Inst_SOP1__S_QUADMASK_B32

// D.u = QuadMask(S0.u):
// D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[31:8] = 0;
// SCC = 1 if result is non-zero.
void
Inst_SOP1__S_QUADMASK_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    // quadMask() is the shared helper collapsing each 4-bit group to 1.
    sdst = quadMask(src.rawData());
    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_QUADMASK_B64::Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_quadmask_b64")
{
    setFlag(ALU);
} // Inst_SOP1__S_QUADMASK_B64

Inst_SOP1__S_QUADMASK_B64::~Inst_SOP1__S_QUADMASK_B64()
{
} // ~Inst_SOP1__S_QUADMASK_B64

// D.u64 = QuadMask(S0.u64):
// D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[63:16] = 0;
// SCC = 1 if result is non-zero.
void
Inst_SOP1__S_QUADMASK_B64::execute(GPUDynInstPtr gpuDynInst)
{
    ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    ScalarOperandU32 scc(gpuDynInst, REG_SCC);

    src.read();

    sdst = quadMask(src.rawData());
    scc = sdst.rawData() ? 1 : 0;

    sdst.write();
    scc.write();
}

Inst_SOP1__S_MOVRELS_B32::Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1 *iFmt)
    : Inst_SOP1(iFmt, "s_movrels_b32")
{
    setFlag(ALU);
} // Inst_SOP1__S_MOVRELS_B32

Inst_SOP1__S_MOVRELS_B32::~Inst_SOP1__S_MOVRELS_B32()
{
} // ~Inst_SOP1__S_MOVRELS_B32

// D.u = SGPR[S0.u + M0.u].u (move from relative source).
- void - Inst_SOP1__S_MOVRELS_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0 + m0.rawData()); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } - - Inst_SOP1__S_MOVRELS_B64::Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movrels_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELS_B64 - - Inst_SOP1__S_MOVRELS_B64::~Inst_SOP1__S_MOVRELS_B64() - { - } // ~Inst_SOP1__S_MOVRELS_B64 - - // D.u64 = SGPR[S0.u + M0.u].u64 (move from relative source). - // The index in M0.u must be even for this operation. - void - Inst_SOP1__S_MOVRELS_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0 + m0.rawData()); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } - - Inst_SOP1__S_MOVRELD_B32::Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movreld_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELD_B32 - - Inst_SOP1__S_MOVRELD_B32::~Inst_SOP1__S_MOVRELD_B32() - { - } // ~Inst_SOP1__S_MOVRELD_B32 - - // SGPR[D.u + M0.u].u = S0.u (move to relative destination). - void - Inst_SOP1__S_MOVRELD_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST + m0.rawData()); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } - - Inst_SOP1__S_MOVRELD_B64::Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movreld_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELD_B64 - - Inst_SOP1__S_MOVRELD_B64::~Inst_SOP1__S_MOVRELD_B64() - { - } // ~Inst_SOP1__S_MOVRELD_B64 - - // SGPR[D.u + M0.u].u64 = S0.u64 (move to relative destination). 
- // The index in M0.u must be even for this operation. - void - Inst_SOP1__S_MOVRELD_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST + m0.rawData()); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } - - Inst_SOP1__S_CBRANCH_JOIN::Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_cbranch_join") - { - setFlag(Branch); - } // Inst_SOP1__S_CBRANCH_JOIN - - Inst_SOP1__S_CBRANCH_JOIN::~Inst_SOP1__S_CBRANCH_JOIN() - { - } // ~Inst_SOP1__S_CBRANCH_JOIN - - // Conditional branch join point (end of conditional branch block). - void - Inst_SOP1__S_CBRANCH_JOIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOP1__S_ABS_I32::Inst_SOP1__S_ABS_I32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_abs_i32") - { - setFlag(ALU); - } // Inst_SOP1__S_ABS_I32 - - Inst_SOP1__S_ABS_I32::~Inst_SOP1__S_ABS_I32() - { - } // ~Inst_SOP1__S_ABS_I32 - - // if (S.i < 0) then D.i = -S.i; - // else D.i = S.i; - // SCC = 1 if result is non-zero. - // Integer absolute value. - void - Inst_SOP1__S_ABS_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = std::abs(src.rawData()); - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } - - Inst_SOP1__S_MOV_FED_B32::Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_fed_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_FED_B32 - - Inst_SOP1__S_MOV_FED_B32::~Inst_SOP1__S_MOV_FED_B32() - { - } // ~Inst_SOP1__S_MOV_FED_B32 - - // D.u = S0.u. 
- void - Inst_SOP1__S_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOP1__S_SET_GPR_IDX_IDX::Inst_SOP1__S_SET_GPR_IDX_IDX( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_set_gpr_idx_idx") - { - } // Inst_SOP1__S_SET_GPR_IDX_IDX - - Inst_SOP1__S_SET_GPR_IDX_IDX::~Inst_SOP1__S_SET_GPR_IDX_IDX() - { - } // ~Inst_SOP1__S_SET_GPR_IDX_IDX - - // M0[7:0] = S0.u[7:0]. - // Modify the index used in vector GPR indexing. - void - Inst_SOP1__S_SET_GPR_IDX_IDX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_I32 - - Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32() - { - } // ~Inst_SOPC__S_CMP_EQ_I32 - - // SCC = (S0.i == S1.i). - void - Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_I32 - - Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32() - { - } // ~Inst_SOPC__S_CMP_LG_I32 - - // SCC = (S0.i != S1.i). - void - Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 
1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_gt_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GT_I32 - - Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32() - { - } // ~Inst_SOPC__S_CMP_GT_I32 - - // SCC = (S0.i > S1.i). - void - Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_ge_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GE_I32 - - Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32() - { - } // ~Inst_SOPC__S_CMP_GE_I32 - - // SCC = (S0.i >= S1.i). - void - Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lt_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LT_I32 - - Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32() - { - } // ~Inst_SOPC__S_CMP_LT_I32 - - // SCC = (S0.i < S1.i). - void - Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_le_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LE_I32 - - Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32() - { - } // ~Inst_SOPC__S_CMP_LE_I32 - - // SCC = (S0.i <= S1.i). - void - Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_U32 - - Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32() - { - } // ~Inst_SOPC__S_CMP_EQ_U32 - - // SCC = (S0.u == S1.u). - void - Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_U32 - - Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32() - { - } // ~Inst_SOPC__S_CMP_LG_U32 - - // SCC = (S0.u != S1.u). - void - Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 
1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_gt_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GT_U32 - - Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32() - { - } // ~Inst_SOPC__S_CMP_GT_U32 - - // SCC = (S0.u > S1.u). - void - Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_ge_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GE_U32 - - Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32() - { - } // ~Inst_SOPC__S_CMP_GE_U32 - - // SCC = (S0.u >= S1.u). - void - Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lt_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LT_U32 - - Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32() - { - } // ~Inst_SOPC__S_CMP_LT_U32 - - // SCC = (S0.u < S1.u). - void - Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_le_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LE_U32 - - Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32() - { - } // ~Inst_SOPC__S_CMP_LE_U32 - - // SCC = (S0.u <= S1.u). - void - Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp0_b32") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP0_B32 - - Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32() - { - } // ~Inst_SOPC__S_BITCMP0_B32 - - // SCC = (S0.u[S1.u[4:0]] == 0). - void - Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp1_b32") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP1_B32 - - Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32() - { - } // ~Inst_SOPC__S_BITCMP1_B32 - - // SCC = (S0.u[S1.u[4:0]] == 1). - void - Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 
1 : 0; - - scc.write(); - } - - Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp0_b64") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP0_B64 - - Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64() - { - } // ~Inst_SOPC__S_BITCMP0_B64 - - // SCC = (S0.u64[S1.u[5:0]] == 0). - void - Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp1_b64") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP1_B64 - - Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64() - { - } // ~Inst_SOPC__S_BITCMP1_B64 - - // SCC = (S0.u64[S1.u[5:0]] == 1). - void - Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_setvskip") - { - setFlag(UnconditionalJump); - } // Inst_SOPC__S_SETVSKIP - - Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP() - { - } // ~Inst_SOPC__S_SETVSKIP - - // VSKIP = S0.u[S1.u[4:0]]. - // Enables and disables VSKIP mode. - // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instuctions are - // issued. 
- void - Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_set_gpr_idx_on") - { - } // Inst_SOPC__S_SET_GPR_IDX_ON - - Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON() - { - } // ~Inst_SOPC__S_SET_GPR_IDX_ON - - // MODE.gpr_idx_en = 1; - // M0[7:0] = S0.u[7:0]; - // M0[15:12] = SIMM4 (direct contents of S1 field); - // Remaining bits of M0 are unmodified. - // Enable GPR indexing mode. Vector operations after this will perform - // relative GPR addressing based on the contents of M0. - // The raw contents of the S1 field are read and used to set the enable - // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and - // S1[3] = VDST_REL. - void - Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_u64") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_U64 - - Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64() - { - } // ~Inst_SOPC__S_CMP_EQ_U64 - - // SCC = (S0.i64 == S1.i64). - void - Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_u64") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_U64 - - Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64() - { - } // ~Inst_SOPC__S_CMP_LG_U64 - - // SCC = (S0.i64 != S1.i64). 
- void - Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 1 : 0; - - scc.write(); - } - - Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_nop") - { - setFlag(Nop); - } // Inst_SOPP__S_NOP - - Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP() - { - } // ~Inst_SOPP__S_NOP - - // Do nothing. - void - Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } - - Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_endpgm") - { - setFlag(EndOfKernel); - } // Inst_SOPP__S_ENDPGM - - Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM() - { - } // ~Inst_SOPP__S_ENDPGM - - // End of program; terminate wavefront. - void - Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ComputeUnit *cu = gpuDynInst->computeUnit(); - - // delete extra instructions fetched for completed work-items - wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1, - wf->instructionBuffer.end()); - - if (wf->pendingFetch) { - wf->dropFetch = true; - } - - wf->computeUnit->fetchStage.fetchUnit(wf->simdId) - .flushBuf(wf->wfSlotId); - wf->setStatus(Wavefront::S_STOPPED); - - int refCount = wf->computeUnit->getLds() - .decreaseRefCounter(wf->dispatchId, wf->wgId); - - /** - * The parent WF of this instruction is exiting, therefore - * it should not participate in this barrier any longer. This - * prevents possible deadlock issues if WFs exit early. 
- */ - int bar_id = WFBarrier::InvalidID; - if (wf->hasBarrier()) { - assert(wf->getStatus() != Wavefront::S_BARRIER); - bar_id = wf->barrierId(); - assert(bar_id != WFBarrier::InvalidID); - wf->releaseBarrier(); - cu->decMaxBarrierCnt(bar_id); - DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the " - "program and decrementing max barrier count for " - "barrier Id%d. New max count: %d.\n", cu->cu_id, - wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id, - cu->maxBarrierCnt(bar_id)); - } - - DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", - wf->computeUnit->cu_id, wf->wgId, refCount); - - wf->computeUnit->registerManager->freeRegisters(wf); - wf->computeUnit->stats.completedWfs++; - wf->computeUnit->activeWaves--; - - panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less " - "than zero\n", wf->computeUnit->cu_id); - - DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n", - wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId); - - for (int i = 0; i < wf->vecReads.size(); i++) { - if (wf->rawDist.find(i) != wf->rawDist.end()) { - wf->stats.readsPerWrite.sample(wf->vecReads.at(i)); - } - } - wf->vecReads.clear(); - wf->rawDist.clear(); - wf->lastInstExec = 0; - - if (!refCount) { - /** - * If all WFs have finished, and hence the WG has finished, - * then we can free up the barrier belonging to the parent - * WG, but only if we actually used a barrier (i.e., more - * than one WF in the WG). - */ - if (bar_id != WFBarrier::InvalidID) { - DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are " - "now complete. Releasing barrier Id%d.\n", cu->cu_id, - wf->simdId, wf->wfSlotId, wf->wfDynId, - wf->barrierId()); - cu->releaseBarrier(bar_id); - } - - /** - * Last wavefront of the workgroup has executed return. If the - * workgroup is not the final one in the kernel, then simply - * retire it; however, if it is the final one (i.e., indicating - * the kernel end) then release operation is needed. 
- */ - - // check whether the workgroup is indicating the kernel end (i.e., - // the last workgroup in the kernel). - bool kernelEnd = - wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf); - // further check whether 'release @ kernel end' is needed - bool relNeeded = - wf->computeUnit->shader->impl_kern_end_rel; - - // if not a kernel end or no release needed, retire the workgroup - // directly - if (!kernelEnd || !relNeeded) { - wf->computeUnit->shader->dispatcher().notifyWgCompl(wf); - wf->setStatus(Wavefront::S_STOPPED); - wf->computeUnit->stats.completedWGs++; - - return; - } - - /** - * If a kernel end and release needed, inject a memory sync and - * retire the workgroup after receving all acks. - */ - setFlag(MemSync); - setFlag(GlobalSegment); - // Notify Memory System of Kernel Completion - wf->setStatus(Wavefront::S_RETURNING); - gpuDynInst->simdId = wf->simdId; - gpuDynInst->wfSlotId = wf->wfSlotId; - gpuDynInst->wfDynId = wf->wfDynId; - - DPRINTF(GPUExec, "inject global memory fence for CU%d: " - "WF[%d][%d][%d]\n", wf->computeUnit->cu_id, - wf->simdId, wf->wfSlotId, wf->wfDynId); - - // call shader to prepare the flush operations - wf->computeUnit->shader->prepareFlush(gpuDynInst); - - wf->computeUnit->stats.completedWGs++; - } else { - wf->computeUnit->shader->dispatcher().scheduleDispatch(); - } - } - - - Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_branch") - { - setFlag(Branch); - } // Inst_SOPP__S_BRANCH - - Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH() - { - } // ~Inst_SOPP__S_BRANCH - - // PC = PC + signext(SIMM16 * 4) + 4 (short jump). 
- void - Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = wf->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - - pc = pc + ((ScalarRegI64)sext<18>(simm16 * 4LL)) + 4LL; - - wf->pc(pc); - } - - Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_wakeup") - { - } // Inst_SOPP__S_WAKEUP - - Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP() - { - } // ~Inst_SOPP__S_WAKEUP - - // Allow a wave to wakeup all the other waves in its workgroup to force - // them to wake up immediately from an S_SLEEP instruction. The wakeup is - // ignored if the waves are not sleeping. - void - Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_scc0") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_SCC0 - - Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0() - { - } // ~Inst_SOPP__S_CBRANCH_SCC0 - - // if (SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = wf->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (!scc.rawData()) { - pc = pc + ((ScalarRegI64)sext<18>(simm16 * 4LL)) + 4LL; - } - - wf->pc(pc); - } - - Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_scc1") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_SCC1 - - Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1() - { - } // ~Inst_SOPP__S_CBRANCH_SCC1 - - // if (SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. 
- void - Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = wf->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (scc.rawData()) { - pc = pc + ((ScalarRegI64)sext<18>(simm16 * 4LL)) + 4LL; - } - - wf->pc(pc); - } - - Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_vccz") - { - setFlag(Branch); - setFlag(ReadsVCC); - } // Inst_SOPP__S_CBRANCH_VCCZ - - Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ() - { - } // ~Inst_SOPP__S_CBRANCH_VCCZ - - // if (VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - Addr pc = wf->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - - vcc.read(); - - if (!vcc.rawData()) { - pc = pc + ((ScalarRegI64)sext<18>(simm16 * 4LL)) + 4LL; - } - - wf->pc(pc); - } - - Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_vccnz") - { - setFlag(Branch); - setFlag(ReadsVCC); - } // Inst_SOPP__S_CBRANCH_VCCNZ - - Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ() - { - } // ~Inst_SOPP__S_CBRANCH_VCCNZ - - // if (VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. 
- void - Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - vcc.read(); - - if (vcc.rawData()) { - Addr pc = wf->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)sext<18>(simm16 * 4LL)) + 4LL; - wf->pc(pc); - } - } - - Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_execz") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_EXECZ - - Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ() - { - } // ~Inst_SOPP__S_CBRANCH_EXECZ - - // if (EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (wf->execMask().none()) { - Addr pc = wf->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)sext<18>(simm16 * 4LL)) + 4LL; - wf->pc(pc); - } - } - - Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_execnz") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_EXECNZ - - Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ() - { - } // ~Inst_SOPP__S_CBRANCH_EXECNZ - - // if (EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (wf->execMask().any()) { - Addr pc = wf->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)sext<18>(simm16 * 4LL)) + 4LL; - wf->pc(pc); - } - } - - Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_barrier") - { - setFlag(MemBarrier); - } // Inst_SOPP__S_BARRIER - - Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER() - { - } // ~Inst_SOPP__S_BARRIER - - /** - * Synchronize waves within a workgroup. 
If not all waves of the workgroup - * have been created yet, wait for entire group before proceeding. If some - * waves in the wokgroup have already terminated, this waits on only the - * surviving waves. - */ - void - Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ComputeUnit *cu = gpuDynInst->computeUnit(); - - if (wf->hasBarrier()) { - int bar_id = wf->barrierId(); - cu->incNumAtBarrier(bar_id); - DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at " - "barrier Id%d. %d waves now at barrier, %d waves " - "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId, - wf->wfDynId, bar_id, cu->numAtBarrier(bar_id), - cu->numYetToReachBarrier(bar_id)); - } - } // execute - // --- Inst_SOPP__S_SETKILL class methods --- - - Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_setkill") - { - } // Inst_SOPP__S_SETKILL - - Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL() - { - } // ~Inst_SOPP__S_SETKILL - - void - Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_waitcnt") - { - setFlag(ALU); - setFlag(Waitcnt); - } // Inst_SOPP__S_WAITCNT - - Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT() - { - } // ~Inst_SOPP__S_WAITCNT - - // Wait for the counts of outstanding lds, vector-memory and - // export/vmem-write-data to be at or below the specified levels. - // SIMM16[3:0] = vmcount (vector memory operations), - // SIMM16[6:4] = export/mem-write-data count, - // SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count). 
- void - Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 vm_cnt = 0; - ScalarRegI32 exp_cnt = 0; - ScalarRegI32 lgkm_cnt = 0; - vm_cnt = bits(instData.SIMM16, 3, 0); - exp_cnt = bits(instData.SIMM16, 6, 4); - lgkm_cnt = bits(instData.SIMM16, 12, 8); - gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt); - } - - Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sethalt") - { - } // Inst_SOPP__S_SETHALT - - Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT() - { - } // ~Inst_SOPP__S_SETHALT - - void - Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sleep") - { - setFlag(ALU); - setFlag(Sleep); - } // Inst_SOPP__S_SLEEP - - Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP() - { - } // ~Inst_SOPP__S_SLEEP - - // Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks. - void - Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)instData.SIMM16; - gpuDynInst->wavefront()->setStatus(Wavefront::S_STALLED_SLEEP); - // sleep duration is specified in multiples of 64 cycles - gpuDynInst->wavefront()->setSleepTime(64 * simm16); - } // execute - // --- Inst_SOPP__S_SETPRIO class methods --- - - Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_setprio") - { - } // Inst_SOPP__S_SETPRIO - - Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO() - { - } // ~Inst_SOPP__S_SETPRIO - - // User settable wave priority is set to SIMM16[1:0]. 0 = lowest, - // 3 = highest. 
- void - Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sendmsg") - { - } // Inst_SOPP__S_SENDMSG - - Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG() - { - } // ~Inst_SOPP__S_SENDMSG - - void - Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_sendmsghalt") - { - } // Inst_SOPP__S_SENDMSGHALT - - Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT() - { - } // ~Inst_SOPP__S_SENDMSGHALT - - void - Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_trap") - { - } // Inst_SOPP__S_TRAP - - Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP() - { - } // ~Inst_SOPP__S_TRAP - - // Enter the trap handler. - void - Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_icache_inv") - { - } // Inst_SOPP__S_ICACHE_INV - - Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV() - { - } // ~Inst_SOPP__S_ICACHE_INV - - // Invalidate entire L1 instruction cache. 
- void - Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_incperflevel") - { - } // Inst_SOPP__S_INCPERFLEVEL - - Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL() - { - } // ~Inst_SOPP__S_INCPERFLEVEL - - void - Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_decperflevel") - { - } // Inst_SOPP__S_DECPERFLEVEL - - Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL() - { - } // ~Inst_SOPP__S_DECPERFLEVEL - - void - Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_ttracedata") - { - } // Inst_SOPP__S_TTRACEDATA - - Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA() - { - } // ~Inst_SOPP__S_TTRACEDATA - - void - Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbgsys") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGSYS - - Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGSYS - - void - Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbguser") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGUSER - - Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGUSER - - void - Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - 
Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER - - Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER:: - ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER - - void - Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER - - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER:: - ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER() - { - } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER - - void - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_endpgm_saved") - { - } // Inst_SOPP__S_ENDPGM_SAVED - - Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED() - { - } // ~Inst_SOPP__S_ENDPGM_SAVED - - // End of program. - void - Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_set_gpr_idx_off") - { - } // Inst_SOPP__S_SET_GPR_IDX_OFF - - Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF() - { - } // ~Inst_SOPP__S_SET_GPR_IDX_OFF - - // MODE.gpr_idx_en = 0. - // Clear GPR indexing mode. Vector operations after this will not perform - // relative GPR addressing regardless of the contents of M0. 
- void - Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE( - InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_set_gpr_idx_mode") - { - } // Inst_SOPP__S_SET_GPR_IDX_MODE - - Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE() - { - } // ~Inst_SOPP__S_SET_GPR_IDX_MODE - - // M0[15:12] = SIMM4. - // Modify the mode used for vector GPR indexing. - // The raw contents of the source field are read and used to set the enable - // bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL - // and SIMM4[3] = VDST_REL. - void - Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SMEM__S_LOAD_DWORD::Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORD - - Inst_SMEM__S_LOAD_DWORD::~Inst_SMEM__S_LOAD_DWORD() - { - } // ~Inst_SMEM__S_LOAD_DWORD - - /** - * Read 1 dword from scalar data cache. If the offset is specified as an - * sgpr, the sgpr contains an unsigned byte offset (the 2 LSBs are - * ignored). If the offset is specified as an immediate 20-bit constant, - * the constant is an unsigned byte offset. 
- */ - void - Inst_SMEM__S_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } - - void - Inst_SMEM__S_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<1>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_LOAD_DWORDX2::Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX2 - - Inst_SMEM__S_LOAD_DWORDX2::~Inst_SMEM__S_LOAD_DWORDX2() - { - } // ~Inst_SMEM__S_LOAD_DWORDX2 - - /** - * Read 2 dwords from scalar data cache. See s_load_dword for details on - * the offset input. 
- */ - void - Inst_SMEM__S_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } - - void - Inst_SMEM__S_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_LOAD_DWORDX4::Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX4 - - Inst_SMEM__S_LOAD_DWORDX4::~Inst_SMEM__S_LOAD_DWORDX4() - { - } // ~Inst_SMEM__S_LOAD_DWORDX4 - - // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } - - void - Inst_SMEM__S_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_LOAD_DWORDX8::Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX8 - - Inst_SMEM__S_LOAD_DWORDX8::~Inst_SMEM__S_LOAD_DWORDX8() - { - } // ~Inst_SMEM__S_LOAD_DWORDX8 - - // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } - - void - Inst_SMEM__S_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<8>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_LOAD_DWORDX16::Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_load_dwordx16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_LOAD_DWORDX16 - - Inst_SMEM__S_LOAD_DWORDX16::~Inst_SMEM__S_LOAD_DWORDX16() - { - } // ~Inst_SMEM__S_LOAD_DWORDX16 - - // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - - addr.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } - - void - Inst_SMEM__S_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<16>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_BUFFER_LOAD_DWORD::Inst_SMEM__S_BUFFER_LOAD_DWORD( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORD - - Inst_SMEM__S_BUFFER_LOAD_DWORD::~Inst_SMEM__S_BUFFER_LOAD_DWORD() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORD - - // Read 1 dword from scalar data cache. See S_LOAD_DWORD for details on the - // offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<1>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 1 request, size 32 - ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX2 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::~Inst_SMEM__S_BUFFER_LOAD_DWORDX2() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX2 - - // Read 2 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - // use U64 because 2 requests, each size 32 - ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX4 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::~Inst_SMEM__S_BUFFER_LOAD_DWORDX4() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX4 - - // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 4 requests, each size 32 - ScalarOperandU128 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::Inst_SMEM__S_BUFFER_LOAD_DWORDX8( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX8 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::~Inst_SMEM__S_BUFFER_LOAD_DWORDX8() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX8 - - // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<8>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 8 requests, each size 32 - ScalarOperandU256 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::Inst_SMEM__S_BUFFER_LOAD_DWORDX16( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_load_dwordx16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_SMEM__S_BUFFER_LOAD_DWORDX16 - - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::~Inst_SMEM__S_BUFFER_LOAD_DWORDX16() - { - } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX16 - - // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE); - - rsrcDesc.read(); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, rsrcDesc, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe - .issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<16>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst) - { - // 16 requests, each size 32 - ScalarOperandU512 sdst(gpuDynInst, instData.SDATA); - sdst.write(); - } // completeAcc - - Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_STORE_DWORD - - Inst_SMEM__S_STORE_DWORD::~Inst_SMEM__S_STORE_DWORD() - { - } // ~Inst_SMEM__S_STORE_DWORD - - // Write 1 dword to scalar data cache. - // If the offset is specified as an SGPR, the SGPR contains an unsigned - // BYTE offset (the 2 LSBs are ignored). - // If the offset is specified as an immediate 20-bit constant, the - // constant is an unsigned BYTE offset. 
- void - Inst_SMEM__S_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - sizeof(ScalarRegU32)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } - - void - Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<1>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_SMEM__S_STORE_DWORDX2::Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_STORE_DWORDX2 - - Inst_SMEM__S_STORE_DWORDX2::~Inst_SMEM__S_STORE_DWORDX2() - { - } // ~Inst_SMEM__S_STORE_DWORDX2 - - // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - sizeof(ScalarRegU64)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } - - void - Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<2>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_SMEM__S_STORE_DWORDX4::Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_STORE_DWORDX4 - - Inst_SMEM__S_STORE_DWORDX4::~Inst_SMEM__S_STORE_DWORDX4() - { - } // ~Inst_SMEM__S_STORE_DWORDX4 - - // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU128 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - 4 * sizeof(ScalarRegU32)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } - - void - Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_SMEM__S_BUFFER_STORE_DWORD::Inst_SMEM__S_BUFFER_STORE_DWORD( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORD - - Inst_SMEM__S_BUFFER_STORE_DWORD::~Inst_SMEM__S_BUFFER_STORE_DWORD() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORD - - // Write 1 dword to scalar data cache. See S_STORE_DWORD for details on the - // offset input. 
- void - Inst_SMEM__S_BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_SMEM__S_BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_SMEM__S_BUFFER_STORE_DWORDX2::Inst_SMEM__S_BUFFER_STORE_DWORDX2( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORDX2 - - Inst_SMEM__S_BUFFER_STORE_DWORDX2::~Inst_SMEM__S_BUFFER_STORE_DWORDX2() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX2 - - // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. - void - Inst_SMEM__S_BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_SMEM__S_BUFFER_STORE_DWORDX4::Inst_SMEM__S_BUFFER_STORE_DWORDX4( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORDX4 - - Inst_SMEM__S_BUFFER_STORE_DWORDX4::~Inst_SMEM__S_BUFFER_STORE_DWORDX4() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX4 - - // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on - // the offset input. 
- void - Inst_SMEM__S_BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_SMEM__S_BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_SMEM__S_DCACHE_INV::Inst_SMEM__S_DCACHE_INV(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_inv") - { - } // Inst_SMEM__S_DCACHE_INV - - Inst_SMEM__S_DCACHE_INV::~Inst_SMEM__S_DCACHE_INV() - { - } // ~Inst_SMEM__S_DCACHE_INV - - // Invalidate the scalar data cache. - void - Inst_SMEM__S_DCACHE_INV::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SMEM__S_DCACHE_WB::Inst_SMEM__S_DCACHE_WB(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_wb") - { - } // Inst_SMEM__S_DCACHE_WB - - Inst_SMEM__S_DCACHE_WB::~Inst_SMEM__S_DCACHE_WB() - { - } // ~Inst_SMEM__S_DCACHE_WB - - // Write back dirty data in the scalar data cache. - void - Inst_SMEM__S_DCACHE_WB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SMEM__S_DCACHE_INV_VOL::Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_inv_vol") - { - } // Inst_SMEM__S_DCACHE_INV_VOL - - Inst_SMEM__S_DCACHE_INV_VOL::~Inst_SMEM__S_DCACHE_INV_VOL() - { - } // ~Inst_SMEM__S_DCACHE_INV_VOL - - // Invalidate the scalar data cache volatile lines. - void - Inst_SMEM__S_DCACHE_INV_VOL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SMEM__S_DCACHE_WB_VOL::Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_dcache_wb_vol") - { - } // Inst_SMEM__S_DCACHE_WB_VOL - - Inst_SMEM__S_DCACHE_WB_VOL::~Inst_SMEM__S_DCACHE_WB_VOL() - { - } // ~Inst_SMEM__S_DCACHE_WB_VOL - - // Write back dirty data in the scalar data cache volatile lines. 
- void - Inst_SMEM__S_DCACHE_WB_VOL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SMEM__S_MEMTIME::Inst_SMEM__S_MEMTIME(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_memtime") - { - // s_memtime does not issue a memory request - setFlag(ALU); - } // Inst_SMEM__S_MEMTIME - - Inst_SMEM__S_MEMTIME::~Inst_SMEM__S_MEMTIME() - { - } // ~Inst_SMEM__S_MEMTIME - - // Return current 64-bit timestamp. - void - Inst_SMEM__S_MEMTIME::execute(GPUDynInstPtr gpuDynInst) - { - ScalarOperandU64 sdst(gpuDynInst, instData.SDATA); - sdst = (ScalarRegU64)gpuDynInst->computeUnit()->curCycle(); - sdst.write(); - } - - Inst_SMEM__S_MEMREALTIME::Inst_SMEM__S_MEMREALTIME(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_memrealtime") - { - } // Inst_SMEM__S_MEMREALTIME - - Inst_SMEM__S_MEMREALTIME::~Inst_SMEM__S_MEMREALTIME() - { - } // ~Inst_SMEM__S_MEMREALTIME - - // Return current 64-bit RTC. - void - Inst_SMEM__S_MEMREALTIME::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SMEM__S_ATC_PROBE::Inst_SMEM__S_ATC_PROBE(InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_atc_probe") - { - } // Inst_SMEM__S_ATC_PROBE - - Inst_SMEM__S_ATC_PROBE::~Inst_SMEM__S_ATC_PROBE() - { - } // ~Inst_SMEM__S_ATC_PROBE - - void - Inst_SMEM__S_ATC_PROBE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_SMEM__S_ATC_PROBE_BUFFER::Inst_SMEM__S_ATC_PROBE_BUFFER( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_atc_probe_buffer") - { - } // Inst_SMEM__S_ATC_PROBE_BUFFER - - Inst_SMEM__S_ATC_PROBE_BUFFER::~Inst_SMEM__S_ATC_PROBE_BUFFER() - { - } // ~Inst_SMEM__S_ATC_PROBE_BUFFER - - void - Inst_SMEM__S_ATC_PROBE_BUFFER::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_cndmask_b32") - { - setFlag(ALU); - setFlag(ReadsVCC); - } // Inst_VOP2__V_CNDMASK_B32 - - Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32() - { - } // ~Inst_VOP2__V_CNDMASK_B32 - - // 
D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC - // as a scalar GPR in S2. - void - Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_ADD_F32 - - Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32() - { - } // ~Inst_VOP2__V_ADD_F32 - - // D.f = S0.f + S1.f. - void - Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - VecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isDPPInst()) { - VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src0_dpp.read(); - - DPRINTF(GCN3, "Handling V_ADD_F32 SRC DPP. 
SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BOUND_CTRL: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BOUND_CTRL, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_dpp[lane] + src1[lane]; - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_SUB_F32 - - Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32() - { - } // ~Inst_VOP2__V_SUB_F32 - - // D.f = S0.f - S1.f. - void - Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_SUBREV_F32 - - Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32() - { - } // ~Inst_VOP2__V_SUBREV_F32 - - // D.f = S1.f - S0.f. 
- void - Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_legacy_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MUL_LEGACY_F32 - - Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32() - { - } // ~Inst_VOP2__V_MUL_LEGACY_F32 - - // D.f = S0.f * S1.f - void - Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MUL_F32 - - Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32() - { - } // ~Inst_VOP2__V_MUL_F32 - - // D.f = S0.f * S1.f. 
- void - Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_i32_i24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_I32_I24 - - Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24() - { - } // ~Inst_VOP2__V_MUL_I32_I24 - - // D.i = 
S0.i[23:0] * S1.i[23:0]. - void - Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = szext<24>(src0[lane]) * szext<24>(src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_hi_i32_i24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_HI_I32_I24 - - Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24() - { - } // ~Inst_VOP2__V_MUL_HI_I32_I24 - - // D.i = (S0.i[23:0] * S1.i[23:0]) >> 32. - void - Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 tmp_src0 = (VecElemI64)szext<24>(src0[lane]); - VecElemI64 tmp_src1 = (VecElemI64)szext<24>(src1[lane]); - - vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_u32_u24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_U32_U24 - - Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24() - { - } // ~Inst_VOP2__V_MUL_U32_U24 - - // D.u = S0.u[23:0] * S1.u[23:0]. 
- void - Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(GCN3, "Handling V_MUL_U32_U24 SRC SDWA. SRC0: register " - "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: " - "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: " - "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_UNUSED, - extData.iFmt_VOP_SDWA.CLAMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0_sdwa[lane], 23, 0) * - bits(src1[lane], 23, 0); - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * - bits(src1[lane], 23, 0); - } - } - } - - - vdst.write(); - } - - 
Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_hi_u32_u24") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_HI_U32_U24 - - Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24() - { - } // ~Inst_VOP2__V_MUL_HI_U32_U24 - - // D.i = (S0.u[23:0] * S1.u[23:0]) >> 32. - void - Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); - VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); - vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MIN_F32 - - Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32() - { - } // ~Inst_VOP2__V_MIN_F32 - - // D.f = (S0.f < S1.f ? S0.f : S1.f). - void - Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MAX_F32 - - Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32() - { - } // ~Inst_VOP2__V_MAX_F32 - - // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
- void - Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_I32 - - Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32() - { - } // ~Inst_VOP2__V_MIN_I32 - - // D.i = min(S0.i, S1.i). - void - Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_I32 - - Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32() - { - } // ~Inst_VOP2__V_MAX_I32 - - // D.i = max(S0.i, S1.i). 
- void - Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_U32 - - Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32() - { - } // ~Inst_VOP2__V_MIN_U32 - - // D.u = min(S0.u, S1.u). - void - Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_U32 - - Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32() - { - } // ~Inst_VOP2__V_MAX_U32 - - // D.u = max(S0.u, S1.u). 
- void - Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshrrev_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHRREV_B32 - - Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32() - { - } // ~Inst_VOP2__V_LSHRREV_B32 - - // D.u = S1.u >> S0.u[4:0]. - // The vacated bits are set to zero. - void - Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } - - Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ashrrev_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_ASHRREV_I32 - - Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32() - { - } // ~Inst_VOP2__V_ASHRREV_I32 - - // D.i = signext(S1.i) >> S0.i[4:0]. - // The vacated bits are set to the sign bit of the input value. 
- void - Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } - - Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshlrev_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHLREV_B32 - - Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32() - { - } // ~Inst_VOP2__V_LSHLREV_B32 - - // D.u = S1.u << S0.u[4:0]. - void - Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and vdst during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(GCN3, "Handling V_LSHLREV_B32 SRC SDWA. 
SRC0: register " - "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: " - "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: " - "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_UNUSED, - extData.iFmt_VOP_SDWA.CLAMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0); - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); - } - } - } - - vdst.write(); - } - - Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_and_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_AND_B32 - - Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32() - { - } // ~Inst_VOP2__V_AND_B32 - - // D.u = S0.u & S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] & src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_or_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_OR_B32 - - Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32() - { - } // ~Inst_VOP2__V_OR_B32 - - // D.u = S0.u | S1.u. - // Input and output modifiers not supported. - void - Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(GCN3, "Handling V_OR_B32 SRC SDWA. 
SRC0: register v[%d], " - "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_UNUSED, - extData.iFmt_VOP_SDWA.CLAMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] | src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_xor_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_XOR_B32 - - Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32() - { - } // ~Inst_VOP2__V_XOR_B32 - - // D.u = S0.u ^ S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] ^ src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mac_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAC); - } // Inst_VOP2__V_MAC_F32 - - Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32() - { - } // ~Inst_VOP2__V_MAC_F32 - - // D.f = S0.f * S1.f + D.f. - void - Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - VecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - vdst.read(); - - if (isDPPInst()) { - VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src0_dpp.read(); - - DPRINTF(GCN3, "Handling V_MAC_F32 SRC DPP. 
SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BOUND_CTRL: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BOUND_CTRL, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0_dpp[lane], src1[lane], - vdst[lane]); - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); - } - } - } - - vdst.write(); - } - - Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madmk_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP2__V_MADMK_F32 - - Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32() - { - } // ~Inst_VOP2__V_MADMK_F32 - - // D.f = S0.f * K + S1.f; K is a 32-bit inline constant. - // This opcode cannot use the input/output modifiers. 
- void - Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - VecElemF32 k = extData.imm_f32; - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], k, src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madak_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP2__V_MADAK_F32 - - Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32() - { - } // ~Inst_VOP2__V_MADAK_F32 - - // D.f = S0.f * S1.f + K; K is a 32-bit inline constant. - // This opcode cannot use input/output modifiers. - void - Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - VecElemF32 k = extData.imm_f32; - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], k); - } - } - - vdst.write(); - } - - Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_ADD_U32 - - Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32() - { - } // ~Inst_VOP2__V_ADD_U32 - - // D.u = S0.u + S1.u; - // VCC[threadId] = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an UNSIGNED - // overflow or carry-out. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. 
- void - Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(GCN3, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], " - "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_UNUSED, - extData.iFmt_VOP_SDWA.CLAMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] + src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane] - + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 
1 : 0); - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); - } - } - } - - vcc.write(); - vdst.write(); - } - - Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_SUB_U32 - - Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32() - { - } // ~Inst_VOP2__V_SUB_U32 - - // D.u = S0.u - S1.u; - // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } - - Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_SUBREV_U32 - - Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32() - { - } // ~Inst_VOP2__V_SUBREV_U32 - - // D.u = S1.u - S0.u; - // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. 
- void - Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } - - Inst_VOP2__V_ADDC_U32::Inst_VOP2__V_ADDC_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_addc_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_ADDC_U32 - - Inst_VOP2__V_ADDC_U32::~Inst_VOP2__V_ADDC_U32() - { - } // ~Inst_VOP2__V_ADDC_U32 - - // D.u = S0.u + S1.u + VCC[threadId]; - // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x100000000ULL ? 1 : 0) - // is an UNSIGNED overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. - void - Inst_VOP2__V_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] - + bits(vcc.rawData(), lane); - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] - + (VecElemU64)bits(vcc.rawData(), lane, lane)) - >= 0x100000000 ? 
1 : 0); - } - } - - vdst.write(); - vcc.write(); - } - - Inst_VOP2__V_SUBB_U32::Inst_VOP2__V_SUBB_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subb_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_SUBB_U32 - - Inst_VOP2__V_SUBB_U32::~Inst_VOP2__V_SUBB_U32() - { - } // ~Inst_VOP2__V_SUBB_U32 - - // D.u = S0.u - S1.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. - void - Inst_VOP2__V_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src0[lane] - src1[lane] - bits(vcc.rawData(), lane); - vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } - - Inst_VOP2__V_SUBBREV_U32::Inst_VOP2__V_SUBBREV_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subbrev_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_SUBBREV_U32 - - Inst_VOP2__V_SUBBREV_U32::~Inst_VOP2__V_SUBBREV_U32() - { - } // ~Inst_VOP2__V_SUBBREV_U32 - - // D.u = S1.u - S0.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. 
- void - Inst_VOP2__V_SUBBREV_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src1[lane] - src0[lane] - bits(vcc.rawData(), lane); - vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane)) - > src1[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } - - Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_ADD_F16 - - Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16() - { - } // ~Inst_VOP2__V_ADD_F16 - - // D.f16 = S0.f16 + S1.f16. - void - Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_SUB_F16 - - Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16() - { - } // ~Inst_VOP2__V_SUB_F16 - - // D.f16 = S0.f16 - S1.f16. - void - Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_SUBREV_F16 - - Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16() - { - } // ~Inst_VOP2__V_SUBREV_F16 - - // D.f16 = S1.f16 - S0.f16. 
- void - Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MUL_F16 - - Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16() - { - } // ~Inst_VOP2__V_MUL_F16 - - // D.f16 = S0.f16 * S1.f16. - void - Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mac_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAC); - } // Inst_VOP2__V_MAC_F16 - - Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16() - { - } // ~Inst_VOP2__V_MAC_F16 - - // D.f16 = S0.f16 * S1.f16 + D.f16. - void - Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madmk_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP2__V_MADMK_F16 - - Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16() - { - } // ~Inst_VOP2__V_MADMK_F16 - - // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored - // in the following literal DWORD. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // modifiers. - void - Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madak_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP2__V_MADAK_F16 - - Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16() - { - } // ~Inst_VOP2__V_MADAK_F16 - - // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored - // in the following literal DWORD. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // modifiers. 
- void - Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_ADD_U16 - - Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16() - { - } // ~Inst_VOP2__V_ADD_U16 - - // D.u16 = S0.u16 + S1.u16. - void - Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_SUB_U16 - - Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16() - { - } // ~Inst_VOP2__V_SUB_U16 - - // D.u16 = S0.u16 - S1.u16. - void - Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_SUBREV_U16 - - Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16() - { - } // ~Inst_VOP2__V_SUBREV_U16 - - // D.u16 = S1.u16 - S0.u16. 
- void - Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_lo_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_LO_U16 - - Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16() - { - } // ~Inst_VOP2__V_MUL_LO_U16 - - // D.u16 = S0.u16 * S1.u16. - void - Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshlrev_b16") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHLREV_B16 - - Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16() - { - } // ~Inst_VOP2__V_LSHLREV_B16 - - // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. 
- void - Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } - - Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshrrev_b16") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHRREV_B16 - - Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16() - { - } // ~Inst_VOP2__V_LSHRREV_B16 - - // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. - // The vacated bits are set to zero. - void - Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> src0[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ashrrev_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_ASHRREV_I16 - - Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16() - { - } // ~Inst_VOP2__V_ASHRREV_I16 - - // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. - // The vacated bits are set to the sign bit of the input value. 
- void - Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> src0[lane]; - } - } - - vdst.write(); - } - - Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MAX_F16 - - Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16() - { - } // ~Inst_VOP2__V_MAX_F16 - - // D.f16 = max(S0.f16, S1.f16). - void - Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MIN_F16 - - Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16() - { - } // ~Inst_VOP2__V_MIN_F16 - - // D.f16 = min(S0.f16, S1.f16). - void - Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_U16 - - Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16() - { - } // ~Inst_VOP2__V_MAX_U16 - - // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). 
- void - Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_I16 - - Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16() - { - } // ~Inst_VOP2__V_MAX_I16 - - // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). - void - Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_U16 - - Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16() - { - } // ~Inst_VOP2__V_MIN_U16 - - // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). 
- void - Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_I16 - - Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16() - { - } // ~Inst_VOP2__V_MIN_I16 - - // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). - void - Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ldexp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_LDEXP_F16 - - Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16() - { - } // ~Inst_VOP2__V_LDEXP_F16 - - // D.f16 = S0.f16 * (2 ** S1.i16). - void - Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_nop") - { - setFlag(Nop); - setFlag(ALU); - } // Inst_VOP1__V_NOP - - Inst_VOP1__V_NOP::~Inst_VOP1__V_NOP() - { - } // ~Inst_VOP1__V_NOP - - // Do nothing. 
- void - Inst_VOP1__V_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } - - Inst_VOP1__V_MOV_B32::Inst_VOP1__V_MOV_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_mov_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_MOV_B32 - - Inst_VOP1__V_MOV_B32::~Inst_VOP1__V_MOV_B32() - { - } // ~Inst_VOP1__V_MOV_B32 - - // D.u = S0.u. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (isDPPInst()) { - VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src_dpp.read(); - - DPRINTF(GCN3, "Handling V_MOV_B32 SRC DPP. SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BOUND_CTRL: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BOUND_CTRL, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - // NOTE: For VOP1, there is no SRC1, so make sure we're not trying - // to negate it or take the absolute value of it - assert(!extData.iFmt_VOP_DPP.SRC1_ABS); - assert(!extData.iFmt_VOP_DPP.SRC1_NEG); - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src_dpp[lane]; - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_READFIRSTLANE_B32::Inst_VOP1__V_READFIRSTLANE_B32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_readfirstlane_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_READFIRSTLANE_B32 - - 
Inst_VOP1__V_READFIRSTLANE_B32::~Inst_VOP1__V_READFIRSTLANE_B32() - { - } // ~Inst_VOP1__V_READFIRSTLANE_B32 - - // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data - // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec) - // (Lane# = 0 if exec is zero). Ignores exec mask for the access. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_READFIRSTLANE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarRegI32 src_lane(0); - ScalarRegU64 exec_mask = wf->execMask().to_ullong(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (exec_mask) { - src_lane = findLsbSet(exec_mask); - } - - sdst = src[src_lane]; - - sdst.write(); - } - - Inst_VOP1__V_CVT_I32_F64::Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_i32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_I32_F64 - - Inst_VOP1__V_CVT_I32_F64::~Inst_VOP1__V_CVT_I32_F64() - { - } // ~Inst_VOP1__V_CVT_I32_F64 - - // D.i = (int)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN - // is converted to 0. 
- void - Inst_VOP1__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_F64_I32::Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_i32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_I32 - - Inst_VOP1__V_CVT_F64_I32::~Inst_VOP1__V_CVT_F64_I32() - { - } // ~Inst_VOP1__V_CVT_F64_I32 - - // D.d = (double)S0.i. - void - Inst_VOP1__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_F32_I32::Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_i32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_I32 - - Inst_VOP1__V_CVT_F32_I32::~Inst_VOP1__V_CVT_F32_I32() - { - } // ~Inst_VOP1__V_CVT_F32_I32 - - // D.f = (float)S0.i. 
- void - Inst_VOP1__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_F32_U32::Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_u32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_U32 - - Inst_VOP1__V_CVT_F32_U32::~Inst_VOP1__V_CVT_F32_U32() - { - } // ~Inst_VOP1__V_CVT_F32_U32 - - // D.f = (float)S0.u. - void - Inst_VOP1__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_U32_F32::Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_U32_F32 - - Inst_VOP1__V_CVT_U32_F32::~Inst_VOP1__V_CVT_U32_F32() - { - } // ~Inst_VOP1__V_CVT_U32_F32 - - // D.u = (unsigned)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN - // is converted to 0. 
- void - Inst_VOP1__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_I32_F32::Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_I32_F32 - - Inst_VOP1__V_CVT_I32_F32::~Inst_VOP1__V_CVT_I32_F32() - { - } // ~Inst_VOP1__V_CVT_I32_F32 - - // D.i = (int)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN - // is converted to 0. 
- void - Inst_VOP1__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_MOV_FED_B32::Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_mov_fed_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_MOV_FED_B32 - - Inst_VOP1__V_MOV_FED_B32::~Inst_VOP1__V_MOV_FED_B32() - { - } // ~Inst_VOP1__V_MOV_FED_B32 - - // D.u = S0.u; - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_CVT_F16_F32::Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f16_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F16_F32 - - Inst_VOP1__V_CVT_F16_F32::~Inst_VOP1__V_CVT_F16_F32() - { - } // ~Inst_VOP1__V_CVT_F16_F32 - - // D.f16 = flt32_to_flt16(S0.f). - void - Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_CVT_F32_F16::Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_f16") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_F16 - - Inst_VOP1__V_CVT_F32_F16::~Inst_VOP1__V_CVT_F32_F16() - { - } // ~Inst_VOP1__V_CVT_F32_F16 - - // D.f = flt16_to_flt32(S0.f16). 
- void - Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_CVT_RPI_I32_F32::Inst_VOP1__V_CVT_RPI_I32_F32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_RPI_I32_F32 - - Inst_VOP1__V_CVT_RPI_I32_F32::~Inst_VOP1__V_CVT_RPI_I32_F32() - { - } // ~Inst_VOP1__V_CVT_RPI_I32_F32 - - // D.i = (int)floor(S0.f + 0.5). - void - Inst_VOP1__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_FLR_I32_F32::Inst_VOP1__V_CVT_FLR_I32_F32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_FLR_I32_F32 - - Inst_VOP1__V_CVT_FLR_I32_F32::~Inst_VOP1__V_CVT_FLR_I32_F32() - { - } // ~Inst_VOP1__V_CVT_FLR_I32_F32 - - // D.i = (int)floor(S0.f). - void - Inst_VOP1__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemI32)std::floor(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_OFF_F32_I4::Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_off_f32_i4") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_OFF_F32_I4 - - Inst_VOP1__V_CVT_OFF_F32_I4::~Inst_VOP1__V_CVT_OFF_F32_I4() - { - } // ~Inst_VOP1__V_CVT_OFF_F32_I4 - - // 4-bit signed int to 32-bit float. 
- void - Inst_VOP1__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_CVT_F32_F64::Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F32_F64 - - Inst_VOP1__V_CVT_F32_F64::~Inst_VOP1__V_CVT_F32_F64() - { - } // ~Inst_VOP1__V_CVT_F32_F64 - - // D.f = (float)S0.d. - void - Inst_VOP1__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_F64_F32::Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_f32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_F32 - - Inst_VOP1__V_CVT_F64_F32::~Inst_VOP1__V_CVT_F64_F32() - { - } // ~Inst_VOP1__V_CVT_F64_F32 - - // D.d = (double)S0.f. - void - Inst_VOP1__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_F32_UBYTE0::Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE0 - - Inst_VOP1__V_CVT_F32_UBYTE0::~Inst_VOP1__V_CVT_F32_UBYTE0() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE0 - - // D.f = (float)(S0.u[7:0]). 
- void - Inst_VOP1__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0)); - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_F32_UBYTE1::Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE1 - - Inst_VOP1__V_CVT_F32_UBYTE1::~Inst_VOP1__V_CVT_F32_UBYTE1() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE1 - - // D.f = (float)(S0.u[15:8]). - void - Inst_VOP1__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8)); - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_F32_UBYTE2::Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE2 - - Inst_VOP1__V_CVT_F32_UBYTE2::~Inst_VOP1__V_CVT_F32_UBYTE2() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE2 - - // D.f = (float)(S0.u[23:16]). 
- void - Inst_VOP1__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16)); - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_F32_UBYTE3::Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE3 - - Inst_VOP1__V_CVT_F32_UBYTE3::~Inst_VOP1__V_CVT_F32_UBYTE3() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE3 - - // D.f = (float)(S0.u[31:24]). - void - Inst_VOP1__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24)); - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_U32_F64::Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_U32_F64 - - Inst_VOP1__V_CVT_U32_F64::~Inst_VOP1__V_CVT_U32_F64() - { - } // ~Inst_VOP1__V_CVT_U32_F64 - - // D.u = (unsigned)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN - // is converted to 0. 
- void - Inst_VOP1__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_CVT_F64_U32::Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_u32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_U32 - - Inst_VOP1__V_CVT_F64_U32::~Inst_VOP1__V_CVT_F64_U32() - { - } // ~Inst_VOP1__V_CVT_F64_U32 - - // D.d = (double)S0.u. - void - Inst_VOP1__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP1__V_TRUNC_F64::Inst_VOP1__V_TRUNC_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_TRUNC_F64 - - Inst_VOP1__V_TRUNC_F64::~Inst_VOP1__V_TRUNC_F64() - { - } // ~Inst_VOP1__V_TRUNC_F64 - - // D.d = trunc(S0.d), return integer part of S0.d. 
- void - Inst_VOP1__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_CEIL_F64::Inst_VOP1__V_CEIL_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CEIL_F64 - - Inst_VOP1__V_CEIL_F64::~Inst_VOP1__V_CEIL_F64() - { - } // ~Inst_VOP1__V_CEIL_F64 - - // D.d = ceil(S0.d); - void - Inst_VOP1__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_RNDNE_F64::Inst_VOP1__V_RNDNE_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rndne_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RNDNE_F64 - - Inst_VOP1__V_RNDNE_F64::~Inst_VOP1__V_RNDNE_F64() - { - } // ~Inst_VOP1__V_RNDNE_F64 - - // D.d = round_nearest_even(S0.d). 
- void - Inst_VOP1__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_FLOOR_F64::Inst_VOP1__V_FLOOR_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_floor_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FLOOR_F64 - - Inst_VOP1__V_FLOOR_F64::~Inst_VOP1__V_FLOOR_F64() - { - } // ~Inst_VOP1__V_FLOOR_F64 - - // D.d = floor(S0.d); - void - Inst_VOP1__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_FRACT_F32::Inst_VOP1__V_FRACT_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_fract_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FRACT_F32 - - Inst_VOP1__V_FRACT_F32::~Inst_VOP1__V_FRACT_F32() - { - } // ~Inst_VOP1__V_FRACT_F32 - - // D.f = modf(S0.f). 
- void - Inst_VOP1__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } - - Inst_VOP1__V_TRUNC_F32::Inst_VOP1__V_TRUNC_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_TRUNC_F32 - - Inst_VOP1__V_TRUNC_F32::~Inst_VOP1__V_TRUNC_F32() - { - } // ~Inst_VOP1__V_TRUNC_F32 - - // D.f = trunc(S0.f), return integer part of S0.f. - void - Inst_VOP1__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst (gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_CEIL_F32::Inst_VOP1__V_CEIL_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CEIL_F32 - - Inst_VOP1__V_CEIL_F32::~Inst_VOP1__V_CEIL_F32() - { - } // ~Inst_VOP1__V_CEIL_F32 - - // D.f = ceil(S0.f); - void - Inst_VOP1__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_RNDNE_F32::Inst_VOP1__V_RNDNE_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rndne_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RNDNE_F32 - - 
Inst_VOP1__V_RNDNE_F32::~Inst_VOP1__V_RNDNE_F32() - { - } // ~Inst_VOP1__V_RNDNE_F32 - - // D.f = round_nearest_even(S0.f). - void - Inst_VOP1__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_FLOOR_F32::Inst_VOP1__V_FLOOR_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_floor_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FLOOR_F32 - - Inst_VOP1__V_FLOOR_F32::~Inst_VOP1__V_FLOOR_F32() - { - } // ~Inst_VOP1__V_FLOOR_F32 - - // D.f = floor(S0.f); - void - Inst_VOP1__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_EXP_F32::Inst_VOP1__V_EXP_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_exp_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_EXP_F32 - - Inst_VOP1__V_EXP_F32::~Inst_VOP1__V_EXP_F32() - { - } // ~Inst_VOP1__V_EXP_F32 - - // D.f = pow(2.0, S0.f). 
- void - Inst_VOP1__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_LOG_F32::Inst_VOP1__V_LOG_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_log_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_LOG_F32 - - Inst_VOP1__V_LOG_F32::~Inst_VOP1__V_LOG_F32() - { - } // ~Inst_VOP1__V_LOG_F32 - - // D.f = log2(S0.f). - void - Inst_VOP1__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_RCP_F32::Inst_VOP1__V_RCP_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RCP_F32 - - Inst_VOP1__V_RCP_F32::~Inst_VOP1__V_RCP_F32() - { - } // ~Inst_VOP1__V_RCP_F32 - - // D.f = 1.0 / S0.f. 
- void - Inst_VOP1__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP1__V_RCP_IFLAG_F32::Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_iflag_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RCP_IFLAG_F32 - - Inst_VOP1__V_RCP_IFLAG_F32::~Inst_VOP1__V_RCP_IFLAG_F32() - { - } // ~Inst_VOP1__V_RCP_IFLAG_F32 - - // D.f = 1.0 / S0.f. - void - Inst_VOP1__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP1__V_RSQ_F32::Inst_VOP1__V_RSQ_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RSQ_F32 - - Inst_VOP1__V_RSQ_F32::~Inst_VOP1__V_RSQ_F32() - { - } // ~Inst_VOP1__V_RSQ_F32 - - // D.f = 1.0 / sqrt(S0.f). 
- void - Inst_VOP1__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_RCP_F64::Inst_VOP1__V_RCP_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RCP_F64 - - Inst_VOP1__V_RCP_F64::~Inst_VOP1__V_RCP_F64() - { - } // ~Inst_VOP1__V_RCP_F64 - - // D.d = 1.0 / S0.d. - void - Inst_VOP1__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = 1.0 / src[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_RSQ_F64::Inst_VOP1__V_RSQ_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RSQ_F64 - - Inst_VOP1__V_RSQ_F64::~Inst_VOP1__V_RSQ_F64() - { - } // ~Inst_VOP1__V_RSQ_F64 - - // D.d = 1.0 / sqrt(S0.d). 
- void - Inst_VOP1__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane]) - && !std::signbit(src[lane])) { - vdst[lane] = 0.0; - } else if (std::signbit(src[lane])) { - vdst[lane] = NAN; - } else { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_SQRT_F32::Inst_VOP1__V_SQRT_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sqrt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_SQRT_F32 - - Inst_VOP1__V_SQRT_F32::~Inst_VOP1__V_SQRT_F32() - { - } // ~Inst_VOP1__V_SQRT_F32 - - // D.f = sqrt(S0.f). - void - Inst_VOP1__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_SQRT_F64::Inst_VOP1__V_SQRT_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sqrt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_SQRT_F64 - - Inst_VOP1__V_SQRT_F64::~Inst_VOP1__V_SQRT_F64() - { - } // ~Inst_VOP1__V_SQRT_F64 - - // D.d = sqrt(S0.d). 
- void - Inst_VOP1__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_SIN_F32::Inst_VOP1__V_SIN_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sin_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_SIN_F32 - - Inst_VOP1__V_SIN_F32::~Inst_VOP1__V_SIN_F32() - { - } // ~Inst_VOP1__V_SIN_F32 - - // D.f = sin(S0.f * 2 * PI). - void - Inst_VOP1__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (src[lane] < -256.0 || src[lane] > 256.0) { - vdst[lane] = 0.0; - } else { - vdst[lane] = std::sin(src[lane] * 2.0 * pi.rawData()); - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_COS_F32::Inst_VOP1__V_COS_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cos_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_COS_F32 - - Inst_VOP1__V_COS_F32::~Inst_VOP1__V_COS_F32() - { - } // ~Inst_VOP1__V_COS_F32 - - // D.f = cos(S0.f * 2 * PI). 
- void - Inst_VOP1__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (src[lane] < -256.0 || src[lane] > 256.0) { - vdst[lane] = 0.0; - } else { - vdst[lane] = std::cos(src[lane] * 2.0 * pi.rawData()); - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_NOT_B32::Inst_VOP1__V_NOT_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_not_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_NOT_B32 - - Inst_VOP1__V_NOT_B32::~Inst_VOP1__V_NOT_B32() - { - } // ~Inst_VOP1__V_NOT_B32 - - // D.u = ~S0.u. - // Input and output modifiers not supported. - void - Inst_VOP1__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ~src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP1__V_BFREV_B32::Inst_VOP1__V_BFREV_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_bfrev_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_BFREV_B32 - - Inst_VOP1__V_BFREV_B32::~Inst_VOP1__V_BFREV_B32() - { - } // ~Inst_VOP1__V_BFREV_B32 - - // D.u[31:0] = S0.u[0:31], bitfield reverse. - // Input and output modifiers not supported. 
- void - Inst_VOP1__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = reverseBits(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_FFBH_U32::Inst_VOP1__V_FFBH_U32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ffbh_u32") - { - setFlag(ALU); - } // Inst_VOP1__V_FFBH_U32 - - Inst_VOP1__V_FFBH_U32::~Inst_VOP1__V_FFBH_U32() - { - } // ~Inst_VOP1__V_FFBH_U32 - - // D.u = position of first 1 in S0.u from MSB; - // D.u = 0xffffffff if S0.u == 0. - void - Inst_VOP1__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOneMsb(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_FFBL_B32::Inst_VOP1__V_FFBL_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ffbl_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_FFBL_B32 - - Inst_VOP1__V_FFBL_B32::~Inst_VOP1__V_FFBL_B32() - { - } // ~Inst_VOP1__V_FFBL_B32 - - // D.u = position of first 1 in S0.u from LSB; - // D.u = 0xffffffff if S0.u == 0. 
- void - Inst_VOP1__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOne(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_FFBH_I32::Inst_VOP1__V_FFBH_I32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ffbh_i32") - { - setFlag(ALU); - } // Inst_VOP1__V_FFBH_I32 - - Inst_VOP1__V_FFBH_I32::~Inst_VOP1__V_FFBH_I32() - { - } // ~Inst_VOP1__V_FFBH_I32 - - // D.u = position of first bit different from sign bit in S0.i from MSB; - // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. - void - Inst_VOP1__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = firstOppositeSignBit(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_FREXP_EXP_I32_F64::Inst_VOP1__V_FREXP_EXP_I32_F64( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FREXP_EXP_I32_F64 - - Inst_VOP1__V_FREXP_EXP_I32_F64::~Inst_VOP1__V_FREXP_EXP_I32_F64() - { - } // ~Inst_VOP1__V_FREXP_EXP_I32_F64 - - void - Inst_VOP1__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp = 0; - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - 
vdst.write(); - } - - Inst_VOP1__V_FREXP_MANT_F64::Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_mant_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FREXP_MANT_F64 - - Inst_VOP1__V_FREXP_MANT_F64::~Inst_VOP1__V_FREXP_MANT_F64() - { - } // ~Inst_VOP1__V_FREXP_MANT_F64 - - void - Inst_VOP1__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = src[lane]; - } else { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_FRACT_F64::Inst_VOP1__V_FRACT_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_fract_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_FRACT_F64 - - Inst_VOP1__V_FRACT_F64::~Inst_VOP1__V_FRACT_F64() - { - } // ~Inst_VOP1__V_FRACT_F64 - - void - Inst_VOP1__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF64 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } - - Inst_VOP1__V_FREXP_EXP_I32_F32::Inst_VOP1__V_FREXP_EXP_I32_F32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FREXP_EXP_I32_F32 - - Inst_VOP1__V_FREXP_EXP_I32_F32::~Inst_VOP1__V_FREXP_EXP_I32_F32() - { - } // ~Inst_VOP1__V_FREXP_EXP_I32_F32 - - // frexp(S0.f, Exponent(S0.f)) - // if (S0.f == INF || S0.f == NAN) then D.i = 0; - // else D.i = Exponent(S0.f); - void - 
Inst_VOP1__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp(0); - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_FREXP_MANT_F32::Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_mant_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_FREXP_MANT_F32 - - Inst_VOP1__V_FREXP_MANT_F32::~Inst_VOP1__V_FREXP_MANT_F32() - { - } // ~Inst_VOP1__V_FREXP_MANT_F32 - - // if (S0.f == INF || S0.f == NAN) then D.f = S0.f; - // else D.f = frexp(S0.f, Exponent(S0.f)). - void - Inst_VOP1__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = src[lane]; - } else { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - } - - vdst.write(); - } - - Inst_VOP1__V_CLREXCP::Inst_VOP1__V_CLREXCP(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_clrexcp") - { - setFlag(ALU); - } // Inst_VOP1__V_CLREXCP - - Inst_VOP1__V_CLREXCP::~Inst_VOP1__V_CLREXCP() - { - } // ~Inst_VOP1__V_CLREXCP - - void - Inst_VOP1__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_CVT_F16_U16::Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f16_u16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_F16_U16 - - Inst_VOP1__V_CVT_F16_U16::~Inst_VOP1__V_CVT_F16_U16() - { - 
} // ~Inst_VOP1__V_CVT_F16_U16 - - // D.f16 = uint16_to_flt16(S.u16). - void - Inst_VOP1__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_CVT_F16_I16::Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f16_i16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_F16_I16 - - Inst_VOP1__V_CVT_F16_I16::~Inst_VOP1__V_CVT_F16_I16() - { - } // ~Inst_VOP1__V_CVT_F16_I16 - - // D.f16 = int16_to_flt16(S.i16). - void - Inst_VOP1__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_CVT_U16_F16::Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u16_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_U16_F16 - - Inst_VOP1__V_CVT_U16_F16::~Inst_VOP1__V_CVT_U16_F16() - { - } // ~Inst_VOP1__V_CVT_U16_F16 - - // D.u16 = flt16_to_uint16(S.f16). - void - Inst_VOP1__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_CVT_I16_F16::Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_i16_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CVT_I16_F16 - - Inst_VOP1__V_CVT_I16_F16::~Inst_VOP1__V_CVT_I16_F16() - { - } // ~Inst_VOP1__V_CVT_I16_F16 - - // D.i16 = flt16_to_int16(S.f16). 
- void - Inst_VOP1__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_RCP_F16::Inst_VOP1__V_RCP_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_RCP_F16 - - Inst_VOP1__V_RCP_F16::~Inst_VOP1__V_RCP_F16() - { - } // ~Inst_VOP1__V_RCP_F16 - - // if (S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = 1 / S0.f16; - void - Inst_VOP1__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_SQRT_F16::Inst_VOP1__V_SQRT_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sqrt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_SQRT_F16 - - Inst_VOP1__V_SQRT_F16::~Inst_VOP1__V_SQRT_F16() - { - } // ~Inst_VOP1__V_SQRT_F16 - - // if (S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = sqrt(S0.f16); - void - Inst_VOP1__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_RSQ_F16::Inst_VOP1__V_RSQ_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_RSQ_F16 - - Inst_VOP1__V_RSQ_F16::~Inst_VOP1__V_RSQ_F16() - { - } // ~Inst_VOP1__V_RSQ_F16 - - // if (S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = 1 / sqrt(S0.f16); - void - Inst_VOP1__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_LOG_F16::Inst_VOP1__V_LOG_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_log_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_LOG_F16 - - Inst_VOP1__V_LOG_F16::~Inst_VOP1__V_LOG_F16() - { - } // ~Inst_VOP1__V_LOG_F16 - - // if (S0.f16 == 1.0f) - // D.f16 = 0.0f; - // else - // D.f16 = log2(S0.f16); - void - Inst_VOP1__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_EXP_F16::Inst_VOP1__V_EXP_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_exp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_EXP_F16 - - 
Inst_VOP1__V_EXP_F16::~Inst_VOP1__V_EXP_F16() - { - } // ~Inst_VOP1__V_EXP_F16 - - // if (S0.f16 == 0.0f) - // D.f16 = 1.0f; - // else - // D.f16 = pow(2.0, S0.f16). - void - Inst_VOP1__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_FREXP_MANT_F16::Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_mant_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FREXP_MANT_F16 - - Inst_VOP1__V_FREXP_MANT_F16::~Inst_VOP1__V_FREXP_MANT_F16() - { - } // ~Inst_VOP1__V_FREXP_MANT_F16 - - // if (S0.f16 == +-INF || S0.f16 == NAN) - // D.f16 = S0.f16; - // else - // D.f16 = mantissa(S0.f16). - void - Inst_VOP1__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_FREXP_EXP_I16_F16::Inst_VOP1__V_FREXP_EXP_I16_F16( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FREXP_EXP_I16_F16 - - Inst_VOP1__V_FREXP_EXP_I16_F16::~Inst_VOP1__V_FREXP_EXP_I16_F16() - { - } // ~Inst_VOP1__V_FREXP_EXP_I16_F16 - - // frexp(S0.f16, Exponent(S0.f16)) - // if (S0.f16 == +-INF || S0.f16 == NAN) - // D.i16 = 0; - // else - // D.i16 = Exponent(S0.f16); - void - Inst_VOP1__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_FLOOR_F16::Inst_VOP1__V_FLOOR_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_floor_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FLOOR_F16 - - Inst_VOP1__V_FLOOR_F16::~Inst_VOP1__V_FLOOR_F16() - { - } // ~Inst_VOP1__V_FLOOR_F16 - - // D.f16 = floor(S0.f16); - void - Inst_VOP1__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_CEIL_F16::Inst_VOP1__V_CEIL_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_CEIL_F16 - - Inst_VOP1__V_CEIL_F16::~Inst_VOP1__V_CEIL_F16() - { - } // ~Inst_VOP1__V_CEIL_F16 - - // D.f16 = 
ceil(S0.f16); - void - Inst_VOP1__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_TRUNC_F16::Inst_VOP1__V_TRUNC_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_TRUNC_F16 - - Inst_VOP1__V_TRUNC_F16::~Inst_VOP1__V_TRUNC_F16() - { - } // ~Inst_VOP1__V_TRUNC_F16 - - // D.f16 = trunc(S0.f16). - void - Inst_VOP1__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_RNDNE_F16::Inst_VOP1__V_RNDNE_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rndne_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_RNDNE_F16 - - Inst_VOP1__V_RNDNE_F16::~Inst_VOP1__V_RNDNE_F16() - { - } // ~Inst_VOP1__V_RNDNE_F16 - - // D.f16 = roundNearestEven(S0.f16); - void - Inst_VOP1__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_FRACT_F16::Inst_VOP1__V_FRACT_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_fract_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_FRACT_F16 - - Inst_VOP1__V_FRACT_F16::~Inst_VOP1__V_FRACT_F16() - { - } // ~Inst_VOP1__V_FRACT_F16 - - // D.f16 = S0.f16 + -floor(S0.f16). - void - Inst_VOP1__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_SIN_F16::Inst_VOP1__V_SIN_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_sin_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_SIN_F16 - - Inst_VOP1__V_SIN_F16::~Inst_VOP1__V_SIN_F16() - { - } // ~Inst_VOP1__V_SIN_F16 - - // D.f16 = sin(S0.f16 * 2 * PI). - void - Inst_VOP1__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_COS_F16::Inst_VOP1__V_COS_F16(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cos_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP1__V_COS_F16 - - Inst_VOP1__V_COS_F16::~Inst_VOP1__V_COS_F16() - { - } // ~Inst_VOP1__V_COS_F16 - - // D.f16 = cos(S0.f16 * 2 * PI). 
- void - Inst_VOP1__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP1__V_EXP_LEGACY_F32::Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_exp_legacy_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_EXP_LEGACY_F32 - - Inst_VOP1__V_EXP_LEGACY_F32::~Inst_VOP1__V_EXP_LEGACY_F32() - { - } // ~Inst_VOP1__V_EXP_LEGACY_F32 - - // D.f = pow(2.0, S0.f) - void - Inst_VOP1__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP1__V_LOG_LEGACY_F32::Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_log_legacy_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_LOG_LEGACY_F32 - - Inst_VOP1__V_LOG_LEGACY_F32::~Inst_VOP1__V_LOG_LEGACY_F32() - { - } // ~Inst_VOP1__V_LOG_LEGACY_F32 - - // D.f = log2(S0.f). 
- void - Inst_VOP1__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOPC__V_CMP_CLASS_F32::Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_CLASS_F32 - - Inst_VOPC__V_CMP_CLASS_F32::~Inst_VOPC__V_CMP_CLASS_F32() - { - } // ~Inst_VOPC__V_CMP_CLASS_F32 - - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f - // The function reports true if the floating point value is any of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMPX_CLASS_F32::Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f32") - { - setFlag(ALU); - 
setFlag(F32); - } // Inst_VOPC__V_CMPX_CLASS_F32 - - Inst_VOPC__V_CMPX_CLASS_F32::~Inst_VOPC__V_CMPX_CLASS_F32() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F32 - - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.f The function reports true if the floating point value is any of - // the numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOPC__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - 
continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMP_CLASS_F64::Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_CLASS_F64 - - Inst_VOPC__V_CMP_CLASS_F64::~Inst_VOPC__V_CMP_CLASS_F64() - { - } // ~Inst_VOPC__V_CMP_CLASS_F64 - - // VCC = IEEE numeric class function specified in S1.u, performed on S0.d - // The function reports true if the floating point value is any of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMPX_CLASS_F64::Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f64") - { - setFlag(ALU); - 
setFlag(F64); - } // Inst_VOPC__V_CMPX_CLASS_F64 - - Inst_VOPC__V_CMPX_CLASS_F64::~Inst_VOPC__V_CMPX_CLASS_F64() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F64 - - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.d The function reports true if the floating point value is any of - // the numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOPC__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - 
continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMP_CLASS_F16::Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_CLASS_F16 - - Inst_VOPC__V_CMP_CLASS_F16::~Inst_VOPC__V_CMP_CLASS_F16() - { - } // ~Inst_VOPC__V_CMP_CLASS_F16 - - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 - // The function reports true if the floating point value is any of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_CLASS_F16::Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_CLASS_F16 - - Inst_VOPC__V_CMPX_CLASS_F16::~Inst_VOPC__V_CMPX_CLASS_F16() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F16 - - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.f16 - // The function reports true if the floating point value is any of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOPC__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_F_F16::Inst_VOPC__V_CMP_F_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_F_F16 - - Inst_VOPC__V_CMP_F_F16::~Inst_VOPC__V_CMP_F_F16() - { - } // ~Inst_VOPC__V_CMP_F_F16 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_LT_F16::Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LT_F16 - - Inst_VOPC__V_CMP_LT_F16::~Inst_VOPC__V_CMP_LT_F16() - { - } // ~Inst_VOPC__V_CMP_LT_F16 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_EQ_F16::Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_EQ_F16 - - Inst_VOPC__V_CMP_EQ_F16::~Inst_VOPC__V_CMP_EQ_F16() - { - } // ~Inst_VOPC__V_CMP_EQ_F16 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_LE_F16::Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LE_F16 - - Inst_VOPC__V_CMP_LE_F16::~Inst_VOPC__V_CMP_LE_F16() - { - } // ~Inst_VOPC__V_CMP_LE_F16 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_GT_F16::Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_GT_F16 - - Inst_VOPC__V_CMP_GT_F16::~Inst_VOPC__V_CMP_GT_F16() - { - } // ~Inst_VOPC__V_CMP_GT_F16 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_LG_F16::Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LG_F16 - - Inst_VOPC__V_CMP_LG_F16::~Inst_VOPC__V_CMP_LG_F16() - { - } // ~Inst_VOPC__V_CMP_LG_F16 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_GE_F16::Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_GE_F16 - - Inst_VOPC__V_CMP_GE_F16::~Inst_VOPC__V_CMP_GE_F16() - { - } // ~Inst_VOPC__V_CMP_GE_F16 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_O_F16::Inst_VOPC__V_CMP_O_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_O_F16 - - Inst_VOPC__V_CMP_O_F16::~Inst_VOPC__V_CMP_O_F16() - { - } // ~Inst_VOPC__V_CMP_O_F16 - - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_U_F16::Inst_VOPC__V_CMP_U_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_U_F16 - - Inst_VOPC__V_CMP_U_F16::~Inst_VOPC__V_CMP_U_F16() - { - } // ~Inst_VOPC__V_CMP_U_F16 - - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_NGE_F16::Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NGE_F16 - - Inst_VOPC__V_CMP_NGE_F16::~Inst_VOPC__V_CMP_NGE_F16() - { - } // ~Inst_VOPC__V_CMP_NGE_F16 - - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_NLG_F16::Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLG_F16 - - Inst_VOPC__V_CMP_NLG_F16::~Inst_VOPC__V_CMP_NLG_F16() - { - } // ~Inst_VOPC__V_CMP_NLG_F16 - - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_NGT_F16::Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NGT_F16 - - Inst_VOPC__V_CMP_NGT_F16::~Inst_VOPC__V_CMP_NGT_F16() - { - } // ~Inst_VOPC__V_CMP_NGT_F16 - - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_NLE_F16::Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLE_F16 - - Inst_VOPC__V_CMP_NLE_F16::~Inst_VOPC__V_CMP_NLE_F16() - { - } // ~Inst_VOPC__V_CMP_NLE_F16 - - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_NEQ_F16::Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NEQ_F16 - - Inst_VOPC__V_CMP_NEQ_F16::~Inst_VOPC__V_CMP_NEQ_F16() - { - } // ~Inst_VOPC__V_CMP_NEQ_F16 - - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_NLT_F16::Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLT_F16 - - Inst_VOPC__V_CMP_NLT_F16::~Inst_VOPC__V_CMP_NLT_F16() - { - } // ~Inst_VOPC__V_CMP_NLT_F16 - - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_TRU_F16::Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_TRU_F16 - - Inst_VOPC__V_CMP_TRU_F16::~Inst_VOPC__V_CMP_TRU_F16() - { - } // ~Inst_VOPC__V_CMP_TRU_F16 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_F_F16::Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_F_F16 - - Inst_VOPC__V_CMPX_F_F16::~Inst_VOPC__V_CMPX_F_F16() - { - } // ~Inst_VOPC__V_CMPX_F_F16 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_LT_F16::Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_LT_F16 - - Inst_VOPC__V_CMPX_LT_F16::~Inst_VOPC__V_CMPX_LT_F16() - { - } // ~Inst_VOPC__V_CMPX_LT_F16 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_EQ_F16::Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_EQ_F16 - - Inst_VOPC__V_CMPX_EQ_F16::~Inst_VOPC__V_CMPX_EQ_F16() - { - } // ~Inst_VOPC__V_CMPX_EQ_F16 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_LE_F16::Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_LE_F16 - - Inst_VOPC__V_CMPX_LE_F16::~Inst_VOPC__V_CMPX_LE_F16() - { - } // ~Inst_VOPC__V_CMPX_LE_F16 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_GT_F16::Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_GT_F16 - - Inst_VOPC__V_CMPX_GT_F16::~Inst_VOPC__V_CMPX_GT_F16() - { - } // ~Inst_VOPC__V_CMPX_GT_F16 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_LG_F16::Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_LG_F16 - - Inst_VOPC__V_CMPX_LG_F16::~Inst_VOPC__V_CMPX_LG_F16() - { - } // ~Inst_VOPC__V_CMPX_LG_F16 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_GE_F16::Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_GE_F16 - - Inst_VOPC__V_CMPX_GE_F16::~Inst_VOPC__V_CMPX_GE_F16() - { - } // ~Inst_VOPC__V_CMPX_GE_F16 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_O_F16::Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_O_F16 - - Inst_VOPC__V_CMPX_O_F16::~Inst_VOPC__V_CMPX_O_F16() - { - } // ~Inst_VOPC__V_CMPX_O_F16 - - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_U_F16::Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_U_F16 - - Inst_VOPC__V_CMPX_U_F16::~Inst_VOPC__V_CMPX_U_F16() - { - } // ~Inst_VOPC__V_CMPX_U_F16 - - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_NGE_F16::Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_NGE_F16 - - Inst_VOPC__V_CMPX_NGE_F16::~Inst_VOPC__V_CMPX_NGE_F16() - { - } // ~Inst_VOPC__V_CMPX_NGE_F16 - - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_NLG_F16::Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_NLG_F16 - - Inst_VOPC__V_CMPX_NLG_F16::~Inst_VOPC__V_CMPX_NLG_F16() - { - } // ~Inst_VOPC__V_CMPX_NLG_F16 - - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_NGT_F16::Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_NGT_F16 - - Inst_VOPC__V_CMPX_NGT_F16::~Inst_VOPC__V_CMPX_NGT_F16() - { - } // ~Inst_VOPC__V_CMPX_NGT_F16 - - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_NLE_F16::Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_NLE_F16 - - Inst_VOPC__V_CMPX_NLE_F16::~Inst_VOPC__V_CMPX_NLE_F16() - { - } // ~Inst_VOPC__V_CMPX_NLE_F16 - - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_NEQ_F16::Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_NEQ_F16 - - Inst_VOPC__V_CMPX_NEQ_F16::~Inst_VOPC__V_CMPX_NEQ_F16() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F16 - - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_NLT_F16::Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_NLT_F16 - - Inst_VOPC__V_CMPX_NLT_F16::~Inst_VOPC__V_CMPX_NLT_F16() - { - } // ~Inst_VOPC__V_CMPX_NLT_F16 - - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMPX_TRU_F16::Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMPX_TRU_F16 - - Inst_VOPC__V_CMPX_TRU_F16::~Inst_VOPC__V_CMPX_TRU_F16() - { - } // ~Inst_VOPC__V_CMPX_TRU_F16 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOPC__V_CMP_F_F32::Inst_VOPC__V_CMP_F_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_F_F32 - - Inst_VOPC__V_CMP_F_F32::~Inst_VOPC__V_CMP_F_F32() - { - } // ~Inst_VOPC__V_CMP_F_F32 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LT_F32::Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LT_F32 - - Inst_VOPC__V_CMP_LT_F32::~Inst_VOPC__V_CMP_LT_F32() - { - } // ~Inst_VOPC__V_CMP_LT_F32 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_EQ_F32::Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_EQ_F32 - - Inst_VOPC__V_CMP_EQ_F32::~Inst_VOPC__V_CMP_EQ_F32() - { - } // ~Inst_VOPC__V_CMP_EQ_F32 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LE_F32::Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LE_F32 - - Inst_VOPC__V_CMP_LE_F32::~Inst_VOPC__V_CMP_LE_F32() - { - } // ~Inst_VOPC__V_CMP_LE_F32 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_GT_F32::Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_GT_F32 - - Inst_VOPC__V_CMP_GT_F32::~Inst_VOPC__V_CMP_GT_F32() - { - } // ~Inst_VOPC__V_CMP_GT_F32 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LG_F32::Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LG_F32 - - Inst_VOPC__V_CMP_LG_F32::~Inst_VOPC__V_CMP_LG_F32() - { - } // ~Inst_VOPC__V_CMP_LG_F32 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_GE_F32::Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_GE_F32 - - Inst_VOPC__V_CMP_GE_F32::~Inst_VOPC__V_CMP_GE_F32() - { - } // ~Inst_VOPC__V_CMP_GE_F32 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_O_F32::Inst_VOPC__V_CMP_O_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_O_F32 - - Inst_VOPC__V_CMP_O_F32::~Inst_VOPC__V_CMP_O_F32() - { - } // ~Inst_VOPC__V_CMP_O_F32 - - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_U_F32::Inst_VOPC__V_CMP_U_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_U_F32 - - Inst_VOPC__V_CMP_U_F32::~Inst_VOPC__V_CMP_U_F32() - { - } // ~Inst_VOPC__V_CMP_U_F32 - - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NGE_F32::Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NGE_F32 - - Inst_VOPC__V_CMP_NGE_F32::~Inst_VOPC__V_CMP_NGE_F32() - { - } // ~Inst_VOPC__V_CMP_NGE_F32 - - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NLG_F32::Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLG_F32 - - Inst_VOPC__V_CMP_NLG_F32::~Inst_VOPC__V_CMP_NLG_F32() - { - } // ~Inst_VOPC__V_CMP_NLG_F32 - - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NGT_F32::Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NGT_F32 - - Inst_VOPC__V_CMP_NGT_F32::~Inst_VOPC__V_CMP_NGT_F32() - { - } // ~Inst_VOPC__V_CMP_NGT_F32 - - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NLE_F32::Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLE_F32 - - Inst_VOPC__V_CMP_NLE_F32::~Inst_VOPC__V_CMP_NLE_F32() - { - } // ~Inst_VOPC__V_CMP_NLE_F32 - - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NEQ_F32::Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NEQ_F32 - - Inst_VOPC__V_CMP_NEQ_F32::~Inst_VOPC__V_CMP_NEQ_F32() - { - } // ~Inst_VOPC__V_CMP_NEQ_F32 - - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NLT_F32::Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLT_F32 - - Inst_VOPC__V_CMP_NLT_F32::~Inst_VOPC__V_CMP_NLT_F32() - { - } // ~Inst_VOPC__V_CMP_NLT_F32 - - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_TRU_F32::Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_TRU_F32 - - Inst_VOPC__V_CMP_TRU_F32::~Inst_VOPC__V_CMP_TRU_F32() - { - } // ~Inst_VOPC__V_CMP_TRU_F32 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMPX_F_F32::Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_F_F32 - - Inst_VOPC__V_CMPX_F_F32::~Inst_VOPC__V_CMPX_F_F32() - { - } // ~Inst_VOPC__V_CMPX_F_F32 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_LT_F32::Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_LT_F32 - - Inst_VOPC__V_CMPX_LT_F32::~Inst_VOPC__V_CMPX_LT_F32() - { - } // ~Inst_VOPC__V_CMPX_LT_F32 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_EQ_F32::Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_EQ_F32 - - Inst_VOPC__V_CMPX_EQ_F32::~Inst_VOPC__V_CMPX_EQ_F32() - { - } // ~Inst_VOPC__V_CMPX_EQ_F32 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_LE_F32::Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_LE_F32 - - Inst_VOPC__V_CMPX_LE_F32::~Inst_VOPC__V_CMPX_LE_F32() - { - } // ~Inst_VOPC__V_CMPX_LE_F32 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_GT_F32::Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_GT_F32 - - Inst_VOPC__V_CMPX_GT_F32::~Inst_VOPC__V_CMPX_GT_F32() - { - } // ~Inst_VOPC__V_CMPX_GT_F32 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_LG_F32::Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_LG_F32 - - Inst_VOPC__V_CMPX_LG_F32::~Inst_VOPC__V_CMPX_LG_F32() - { - } // ~Inst_VOPC__V_CMPX_LG_F32 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_GE_F32::Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_GE_F32 - - Inst_VOPC__V_CMPX_GE_F32::~Inst_VOPC__V_CMPX_GE_F32() - { - } // ~Inst_VOPC__V_CMPX_GE_F32 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_O_F32::Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_O_F32 - - Inst_VOPC__V_CMPX_O_F32::~Inst_VOPC__V_CMPX_O_F32() - { - } // ~Inst_VOPC__V_CMPX_O_F32 - - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_U_F32::Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_U_F32 - - Inst_VOPC__V_CMPX_U_F32::~Inst_VOPC__V_CMPX_U_F32() - { - } // ~Inst_VOPC__V_CMPX_U_F32 - - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_NGE_F32::Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_NGE_F32 - - Inst_VOPC__V_CMPX_NGE_F32::~Inst_VOPC__V_CMPX_NGE_F32() - { - } // ~Inst_VOPC__V_CMPX_NGE_F32 - - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_NLG_F32::Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_NLG_F32 - - Inst_VOPC__V_CMPX_NLG_F32::~Inst_VOPC__V_CMPX_NLG_F32() - { - } // ~Inst_VOPC__V_CMPX_NLG_F32 - - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_NGT_F32::Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_NGT_F32 - - Inst_VOPC__V_CMPX_NGT_F32::~Inst_VOPC__V_CMPX_NGT_F32() - { - } // ~Inst_VOPC__V_CMPX_NGT_F32 - - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_NLE_F32::Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_NLE_F32 - - Inst_VOPC__V_CMPX_NLE_F32::~Inst_VOPC__V_CMPX_NLE_F32() - { - } // ~Inst_VOPC__V_CMPX_NLE_F32 - - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_NEQ_F32::Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_NEQ_F32 - - Inst_VOPC__V_CMPX_NEQ_F32::~Inst_VOPC__V_CMPX_NEQ_F32() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F32 - - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMPX_NLT_F32::Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_NLT_F32 - - Inst_VOPC__V_CMPX_NLT_F32::~Inst_VOPC__V_CMPX_NLT_F32() - { - } // ~Inst_VOPC__V_CMPX_NLT_F32 - - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_TRU_F32::Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMPX_TRU_F32 - - Inst_VOPC__V_CMPX_TRU_F32::~Inst_VOPC__V_CMPX_TRU_F32() - { - } // ~Inst_VOPC__V_CMPX_TRU_F32 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMP_F_F64::Inst_VOPC__V_CMP_F_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_F_F64 - - Inst_VOPC__V_CMP_F_F64::~Inst_VOPC__V_CMP_F_F64() - { - } // ~Inst_VOPC__V_CMP_F_F64 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LT_F64::Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LT_F64 - - Inst_VOPC__V_CMP_LT_F64::~Inst_VOPC__V_CMP_LT_F64() - { - } // ~Inst_VOPC__V_CMP_LT_F64 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_EQ_F64::Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_EQ_F64 - - Inst_VOPC__V_CMP_EQ_F64::~Inst_VOPC__V_CMP_EQ_F64() - { - } // ~Inst_VOPC__V_CMP_EQ_F64 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LE_F64::Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LE_F64 - - Inst_VOPC__V_CMP_LE_F64::~Inst_VOPC__V_CMP_LE_F64() - { - } // ~Inst_VOPC__V_CMP_LE_F64 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_GT_F64::Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_GT_F64 - - Inst_VOPC__V_CMP_GT_F64::~Inst_VOPC__V_CMP_GT_F64() - { - } // ~Inst_VOPC__V_CMP_GT_F64 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LG_F64::Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LG_F64 - - Inst_VOPC__V_CMP_LG_F64::~Inst_VOPC__V_CMP_LG_F64() - { - } // ~Inst_VOPC__V_CMP_LG_F64 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_GE_F64::Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_GE_F64 - - Inst_VOPC__V_CMP_GE_F64::~Inst_VOPC__V_CMP_GE_F64() - { - } // ~Inst_VOPC__V_CMP_GE_F64 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_O_F64::Inst_VOPC__V_CMP_O_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_O_F64 - - Inst_VOPC__V_CMP_O_F64::~Inst_VOPC__V_CMP_O_F64() - { - } // ~Inst_VOPC__V_CMP_O_F64 - - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_U_F64::Inst_VOPC__V_CMP_U_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_U_F64 - - Inst_VOPC__V_CMP_U_F64::~Inst_VOPC__V_CMP_U_F64() - { - } // ~Inst_VOPC__V_CMP_U_F64 - - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NGE_F64::Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NGE_F64 - - Inst_VOPC__V_CMP_NGE_F64::~Inst_VOPC__V_CMP_NGE_F64() - { - } // ~Inst_VOPC__V_CMP_NGE_F64 - - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NLG_F64::Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLG_F64 - - Inst_VOPC__V_CMP_NLG_F64::~Inst_VOPC__V_CMP_NLG_F64() - { - } // ~Inst_VOPC__V_CMP_NLG_F64 - - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NGT_F64::Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NGT_F64 - - Inst_VOPC__V_CMP_NGT_F64::~Inst_VOPC__V_CMP_NGT_F64() - { - } // ~Inst_VOPC__V_CMP_NGT_F64 - - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NLE_F64::Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLE_F64 - - Inst_VOPC__V_CMP_NLE_F64::~Inst_VOPC__V_CMP_NLE_F64() - { - } // ~Inst_VOPC__V_CMP_NLE_F64 - - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NEQ_F64::Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NEQ_F64 - - Inst_VOPC__V_CMP_NEQ_F64::~Inst_VOPC__V_CMP_NEQ_F64() - { - } // ~Inst_VOPC__V_CMP_NEQ_F64 - - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NLT_F64::Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLT_F64 - - Inst_VOPC__V_CMP_NLT_F64::~Inst_VOPC__V_CMP_NLT_F64() - { - } // ~Inst_VOPC__V_CMP_NLT_F64 - - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_TRU_F64::Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_TRU_F64 - - Inst_VOPC__V_CMP_TRU_F64::~Inst_VOPC__V_CMP_TRU_F64() - { - } // ~Inst_VOPC__V_CMP_TRU_F64 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMPX_F_F64::Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_F_F64 - - Inst_VOPC__V_CMPX_F_F64::~Inst_VOPC__V_CMPX_F_F64() - { - } // ~Inst_VOPC__V_CMPX_F_F64 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_LT_F64::Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_LT_F64 - - Inst_VOPC__V_CMPX_LT_F64::~Inst_VOPC__V_CMPX_LT_F64() - { - } // ~Inst_VOPC__V_CMPX_LT_F64 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_EQ_F64::Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_EQ_F64 - - Inst_VOPC__V_CMPX_EQ_F64::~Inst_VOPC__V_CMPX_EQ_F64() - { - } // ~Inst_VOPC__V_CMPX_EQ_F64 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } - - Inst_VOPC__V_CMPX_LE_F64::Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_LE_F64 - - Inst_VOPC__V_CMPX_LE_F64::~Inst_VOPC__V_CMPX_LE_F64() - { - } // ~Inst_VOPC__V_CMPX_LE_F64 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_GT_F64::Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_GT_F64 - - Inst_VOPC__V_CMPX_GT_F64::~Inst_VOPC__V_CMPX_GT_F64() - { - } // ~Inst_VOPC__V_CMPX_GT_F64 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_LG_F64::Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_LG_F64 - - Inst_VOPC__V_CMPX_LG_F64::~Inst_VOPC__V_CMPX_LG_F64() - { - } // ~Inst_VOPC__V_CMPX_LG_F64 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_GE_F64::Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_GE_F64 - - Inst_VOPC__V_CMPX_GE_F64::~Inst_VOPC__V_CMPX_GE_F64() - { - } // ~Inst_VOPC__V_CMPX_GE_F64 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_O_F64::Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_O_F64 - - Inst_VOPC__V_CMPX_O_F64::~Inst_VOPC__V_CMPX_O_F64() - { - } // ~Inst_VOPC__V_CMPX_O_F64 - - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_U_F64::Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_U_F64 - - Inst_VOPC__V_CMPX_U_F64::~Inst_VOPC__V_CMPX_U_F64() - { - } // ~Inst_VOPC__V_CMPX_U_F64 - - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NGE_F64::Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_NGE_F64 - - Inst_VOPC__V_CMPX_NGE_F64::~Inst_VOPC__V_CMPX_NGE_F64() - { - } // ~Inst_VOPC__V_CMPX_NGE_F64 - - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NLG_F64::Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_NLG_F64 - - Inst_VOPC__V_CMPX_NLG_F64::~Inst_VOPC__V_CMPX_NLG_F64() - { - } // ~Inst_VOPC__V_CMPX_NLG_F64 - - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NGT_F64::Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_NGT_F64 - - Inst_VOPC__V_CMPX_NGT_F64::~Inst_VOPC__V_CMPX_NGT_F64() - { - } // ~Inst_VOPC__V_CMPX_NGT_F64 - - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NLE_F64::Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_NLE_F64 - - Inst_VOPC__V_CMPX_NLE_F64::~Inst_VOPC__V_CMPX_NLE_F64() - { - } // ~Inst_VOPC__V_CMPX_NLE_F64 - - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NEQ_F64::Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_NEQ_F64 - - Inst_VOPC__V_CMPX_NEQ_F64::~Inst_VOPC__V_CMPX_NEQ_F64() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F64 - - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NLT_F64::Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_NLT_F64 - - Inst_VOPC__V_CMPX_NLT_F64::~Inst_VOPC__V_CMPX_NLT_F64() - { - } // ~Inst_VOPC__V_CMPX_NLT_F64 - - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_TRU_F64::Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMPX_TRU_F64 - - Inst_VOPC__V_CMPX_TRU_F64::~Inst_VOPC__V_CMPX_TRU_F64() - { - } // ~Inst_VOPC__V_CMPX_TRU_F64 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMP_F_I16::Inst_VOPC__V_CMP_F_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I16 - - Inst_VOPC__V_CMP_F_I16::~Inst_VOPC__V_CMP_F_I16() - { - } // ~Inst_VOPC__V_CMP_F_I16 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LT_I16::Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I16 - - Inst_VOPC__V_CMP_LT_I16::~Inst_VOPC__V_CMP_LT_I16() - { - } // ~Inst_VOPC__V_CMP_LT_I16 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_EQ_I16::Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I16 - - Inst_VOPC__V_CMP_EQ_I16::~Inst_VOPC__V_CMP_EQ_I16() - { - } // ~Inst_VOPC__V_CMP_EQ_I16 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LE_I16::Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I16 - - Inst_VOPC__V_CMP_LE_I16::~Inst_VOPC__V_CMP_LE_I16() - { - } // ~Inst_VOPC__V_CMP_LE_I16 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_GT_I16::Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I16 - - Inst_VOPC__V_CMP_GT_I16::~Inst_VOPC__V_CMP_GT_I16() - { - } // ~Inst_VOPC__V_CMP_GT_I16 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NE_I16::Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I16 - - Inst_VOPC__V_CMP_NE_I16::~Inst_VOPC__V_CMP_NE_I16() - { - } // ~Inst_VOPC__V_CMP_NE_I16 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_GE_I16::Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I16 - - Inst_VOPC__V_CMP_GE_I16::~Inst_VOPC__V_CMP_GE_I16() - { - } // ~Inst_VOPC__V_CMP_GE_I16 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_T_I16::Inst_VOPC__V_CMP_T_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I16 - - Inst_VOPC__V_CMP_T_I16::~Inst_VOPC__V_CMP_T_I16() - { - } // ~Inst_VOPC__V_CMP_T_I16 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_F_U16::Inst_VOPC__V_CMP_F_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U16 - - Inst_VOPC__V_CMP_F_U16::~Inst_VOPC__V_CMP_F_U16() - { - } // ~Inst_VOPC__V_CMP_F_U16 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LT_U16::Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U16 - - Inst_VOPC__V_CMP_LT_U16::~Inst_VOPC__V_CMP_LT_U16() - { - } // ~Inst_VOPC__V_CMP_LT_U16 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_EQ_U16::Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U16 - - Inst_VOPC__V_CMP_EQ_U16::~Inst_VOPC__V_CMP_EQ_U16() - { - } // ~Inst_VOPC__V_CMP_EQ_U16 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_LE_U16::Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U16 - - Inst_VOPC__V_CMP_LE_U16::~Inst_VOPC__V_CMP_LE_U16() - { - } // ~Inst_VOPC__V_CMP_LE_U16 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_GT_U16::Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U16 - - Inst_VOPC__V_CMP_GT_U16::~Inst_VOPC__V_CMP_GT_U16() - { - } // ~Inst_VOPC__V_CMP_GT_U16 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NE_U16::Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U16 - - Inst_VOPC__V_CMP_NE_U16::~Inst_VOPC__V_CMP_NE_U16() - { - } // ~Inst_VOPC__V_CMP_NE_U16 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_GE_U16::Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U16 - - Inst_VOPC__V_CMP_GE_U16::~Inst_VOPC__V_CMP_GE_U16() - { - } // ~Inst_VOPC__V_CMP_GE_U16 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_T_U16::Inst_VOPC__V_CMP_T_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U16 - - Inst_VOPC__V_CMP_T_U16::~Inst_VOPC__V_CMP_T_U16() - { - } // ~Inst_VOPC__V_CMP_T_U16 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMPX_F_I16::Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_F_I16 - - Inst_VOPC__V_CMPX_F_I16::~Inst_VOPC__V_CMPX_F_I16() - { - } // ~Inst_VOPC__V_CMPX_F_I16 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_LT_I16::Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_LT_I16 - - Inst_VOPC__V_CMPX_LT_I16::~Inst_VOPC__V_CMPX_LT_I16() - { - } // ~Inst_VOPC__V_CMPX_LT_I16 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_EQ_I16::Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_EQ_I16 - - Inst_VOPC__V_CMPX_EQ_I16::~Inst_VOPC__V_CMPX_EQ_I16() - { - } // ~Inst_VOPC__V_CMPX_EQ_I16 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_LE_I16::Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_LE_I16 - - Inst_VOPC__V_CMPX_LE_I16::~Inst_VOPC__V_CMPX_LE_I16() - { - } // ~Inst_VOPC__V_CMPX_LE_I16 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_GT_I16::Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_GT_I16 - - Inst_VOPC__V_CMPX_GT_I16::~Inst_VOPC__V_CMPX_GT_I16() - { - } // ~Inst_VOPC__V_CMPX_GT_I16 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NE_I16::Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_NE_I16 - - Inst_VOPC__V_CMPX_NE_I16::~Inst_VOPC__V_CMPX_NE_I16() - { - } // ~Inst_VOPC__V_CMPX_NE_I16 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_GE_I16::Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_GE_I16 - - Inst_VOPC__V_CMPX_GE_I16::~Inst_VOPC__V_CMPX_GE_I16() - { - } // ~Inst_VOPC__V_CMPX_GE_I16 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_T_I16::Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_T_I16 - - Inst_VOPC__V_CMPX_T_I16::~Inst_VOPC__V_CMPX_T_I16() - { - } // ~Inst_VOPC__V_CMPX_T_I16 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_F_U16::Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_F_U16 - - Inst_VOPC__V_CMPX_F_U16::~Inst_VOPC__V_CMPX_F_U16() - { - } // ~Inst_VOPC__V_CMPX_F_U16 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_LT_U16::Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_LT_U16 - - Inst_VOPC__V_CMPX_LT_U16::~Inst_VOPC__V_CMPX_LT_U16() - { - } // ~Inst_VOPC__V_CMPX_LT_U16 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_EQ_U16::Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_EQ_U16 - - Inst_VOPC__V_CMPX_EQ_U16::~Inst_VOPC__V_CMPX_EQ_U16() - { - } // ~Inst_VOPC__V_CMPX_EQ_U16 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_LE_U16::Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_LE_U16 - - Inst_VOPC__V_CMPX_LE_U16::~Inst_VOPC__V_CMPX_LE_U16() - { - } // ~Inst_VOPC__V_CMPX_LE_U16 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_GT_U16::Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_GT_U16 - - Inst_VOPC__V_CMPX_GT_U16::~Inst_VOPC__V_CMPX_GT_U16() - { - } // ~Inst_VOPC__V_CMPX_GT_U16 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NE_U16::Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_NE_U16 - - Inst_VOPC__V_CMPX_NE_U16::~Inst_VOPC__V_CMPX_NE_U16() - { - } // ~Inst_VOPC__V_CMPX_NE_U16 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
    // VOPC vector compares: each instruction compares SRC0 against VSRC1
    // lane-by-lane and writes one result bit per active lane into VCC.
    // The v_cmpx_* variants additionally copy the new mask into EXEC.
    // Lanes inactive in the current EXEC mask do not update their VCC bit
    // (presumably the operand storage starts zeroed -- TODO confirm
    // against the ScalarOperand implementation).

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // readSrc() (vs. read()) is used for SRC0 only -- presumably it
        // handles the wider SRC0 operand encodings (SGPR/inline constant);
        // confirm against ConstVecOperand.
        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
            }
        }

        // CMPX: the freshly computed compare mask also becomes EXEC.
        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_GE_U16::Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_u16")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GE_U16

    Inst_VOPC__V_CMPX_GE_U16::~Inst_VOPC__V_CMPX_GE_U16()
    {
    } // ~Inst_VOPC__V_CMPX_GE_U16

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_T_U16::Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_t_u16")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_T_U16

    Inst_VOPC__V_CMPX_T_U16::~Inst_VOPC__V_CMPX_T_U16()
    {
    } // ~Inst_VOPC__V_CMPX_T_U16

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // "true" compare: unconditionally set the bit for active lanes.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 1);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMP_F_I32::Inst_VOPC__V_CMP_F_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_F_I32

    Inst_VOPC__V_CMP_F_I32::~Inst_VOPC__V_CMP_F_I32()
    {
    } // ~Inst_VOPC__V_CMP_F_I32

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // "false" compare: unconditionally clear the bit for active lanes.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_LT_I32::Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_LT_I32

    Inst_VOPC__V_CMP_LT_I32::~Inst_VOPC__V_CMP_LT_I32()
    {
    } // ~Inst_VOPC__V_CMP_LT_I32

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_EQ_I32::Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_EQ_I32

    Inst_VOPC__V_CMP_EQ_I32::~Inst_VOPC__V_CMP_EQ_I32()
    {
    } // ~Inst_VOPC__V_CMP_EQ_I32

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_LE_I32::Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_LE_I32

    Inst_VOPC__V_CMP_LE_I32::~Inst_VOPC__V_CMP_LE_I32()
    {
    } // ~Inst_VOPC__V_CMP_LE_I32

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_GT_I32::Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_GT_I32

    Inst_VOPC__V_CMP_GT_I32::~Inst_VOPC__V_CMP_GT_I32()
    {
    } // ~Inst_VOPC__V_CMP_GT_I32

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    // Signed 32-bit VOPC compares (v_cmp_gt/ne/ge/t_i32) and the first
    // unsigned 32-bit compares (v_cmp_f/lt_u32).  One VCC result bit per
    // active lane; plain v_cmp_* variants leave EXEC untouched.
    void
    Inst_VOPC__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_NE_I32::Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ne_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_NE_I32

    Inst_VOPC__V_CMP_NE_I32::~Inst_VOPC__V_CMP_NE_I32()
    {
    } // ~Inst_VOPC__V_CMP_NE_I32

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_GE_I32::Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_GE_I32

    Inst_VOPC__V_CMP_GE_I32::~Inst_VOPC__V_CMP_GE_I32()
    {
    } // ~Inst_VOPC__V_CMP_GE_I32

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_T_I32::Inst_VOPC__V_CMP_T_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_t_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_T_I32

    Inst_VOPC__V_CMP_T_I32::~Inst_VOPC__V_CMP_T_I32()
    {
    } // ~Inst_VOPC__V_CMP_T_I32

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // "true" compare: no sources, active lanes get a 1 bit.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 1);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_F_U32::Inst_VOPC__V_CMP_F_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_F_U32

    Inst_VOPC__V_CMP_F_U32::~Inst_VOPC__V_CMP_F_U32()
    {
    } // ~Inst_VOPC__V_CMP_F_U32

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // "false" compare: no sources, active lanes get a 0 bit.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_LT_U32::Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_LT_U32

    Inst_VOPC__V_CMP_LT_U32::~Inst_VOPC__V_CMP_LT_U32()
    {
    } // ~Inst_VOPC__V_CMP_LT_U32

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // Unsigned 32-bit VOPC compares (v_cmp_lt/eq/le/gt_u32).  Each writes
    // one VCC bit per active lane; EXEC is not modified by plain v_cmp_*.
    void
    Inst_VOPC__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_EQ_U32::Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_EQ_U32

    Inst_VOPC__V_CMP_EQ_U32::~Inst_VOPC__V_CMP_EQ_U32()
    {
    } // ~Inst_VOPC__V_CMP_EQ_U32

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_LE_U32::Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_LE_U32

    Inst_VOPC__V_CMP_LE_U32::~Inst_VOPC__V_CMP_LE_U32()
    {
    } // ~Inst_VOPC__V_CMP_LE_U32

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_GT_U32::Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_GT_U32

    Inst_VOPC__V_CMP_GT_U32::~Inst_VOPC__V_CMP_GT_U32()
    {
    } // ~Inst_VOPC__V_CMP_GT_U32

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_NE_U32::Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ne_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_NE_U32

    Inst_VOPC__V_CMP_NE_U32::~Inst_VOPC__V_CMP_NE_U32()
    {
    } // ~Inst_VOPC__V_CMP_NE_U32

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    // Remaining unsigned 32-bit compares plus the v_cmpx_*_i32 family.
    // The cmpx variants write the computed per-lane mask to both VCC and
    // EXEC; the plain ones write only VCC.
    void
    Inst_VOPC__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_GE_U32::Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_GE_U32

    Inst_VOPC__V_CMP_GE_U32::~Inst_VOPC__V_CMP_GE_U32()
    {
    } // ~Inst_VOPC__V_CMP_GE_U32

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_T_U32::Inst_VOPC__V_CMP_T_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_t_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_T_U32

    Inst_VOPC__V_CMP_T_U32::~Inst_VOPC__V_CMP_T_U32()
    {
    } // ~Inst_VOPC__V_CMP_T_U32

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 1);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMPX_F_I32::Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_F_I32

    Inst_VOPC__V_CMPX_F_I32::~Inst_VOPC__V_CMPX_F_I32()
    {
    } // ~Inst_VOPC__V_CMPX_F_I32

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 0);
            }
        }

        // CMPX: the compare mask also becomes EXEC.
        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_LT_I32::Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LT_I32

    Inst_VOPC__V_CMPX_LT_I32::~Inst_VOPC__V_CMPX_LT_I32()
    {
    } // ~Inst_VOPC__V_CMPX_LT_I32

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_EQ_I32::Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_EQ_I32

    Inst_VOPC__V_CMPX_EQ_I32::~Inst_VOPC__V_CMPX_EQ_I32()
    {
    } // ~Inst_VOPC__V_CMPX_EQ_I32

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_LE_I32::Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LE_I32

    Inst_VOPC__V_CMPX_LE_I32::~Inst_VOPC__V_CMPX_LE_I32()
    {
    } // ~Inst_VOPC__V_CMPX_LE_I32

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_GT_I32::Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GT_I32

    Inst_VOPC__V_CMPX_GT_I32::~Inst_VOPC__V_CMPX_GT_I32()
    {
    } // ~Inst_VOPC__V_CMPX_GT_I32

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_NE_I32::Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ne_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_NE_I32

    Inst_VOPC__V_CMPX_NE_I32::~Inst_VOPC__V_CMPX_NE_I32()
    {
    } // ~Inst_VOPC__V_CMPX_NE_I32

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_GE_I32::Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GE_I32

    Inst_VOPC__V_CMPX_GE_I32::~Inst_VOPC__V_CMPX_GE_I32()
    {
    } // ~Inst_VOPC__V_CMPX_GE_I32

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_T_I32::Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_t_i32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_T_I32

    Inst_VOPC__V_CMPX_T_I32::~Inst_VOPC__V_CMPX_T_I32()
    {
    } // ~Inst_VOPC__V_CMPX_T_I32

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 1);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_F_U32::Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_f_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_F_U32

    Inst_VOPC__V_CMPX_F_U32::~Inst_VOPC__V_CMPX_F_U32()
    {
    } // ~Inst_VOPC__V_CMPX_F_U32

    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
    // v_cmpx_*_u32 family and the start of the 64-bit compares.  Each
    // cmpx instruction writes the per-lane compare mask to VCC and then
    // overwrites EXEC with the same mask.
    void
    Inst_VOPC__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_LT_U32::Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_lt_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LT_U32

    Inst_VOPC__V_CMPX_LT_U32::~Inst_VOPC__V_CMPX_LT_U32()
    {
    } // ~Inst_VOPC__V_CMPX_LT_U32

    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_EQ_U32::Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_eq_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_EQ_U32

    Inst_VOPC__V_CMPX_EQ_U32::~Inst_VOPC__V_CMPX_EQ_U32()
    {
    } // ~Inst_VOPC__V_CMPX_EQ_U32

    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_LE_U32::Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_le_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_LE_U32

    Inst_VOPC__V_CMPX_LE_U32::~Inst_VOPC__V_CMPX_LE_U32()
    {
    } // ~Inst_VOPC__V_CMPX_LE_U32

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_GT_U32::Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_gt_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GT_U32

    Inst_VOPC__V_CMPX_GT_U32::~Inst_VOPC__V_CMPX_GT_U32()
    {
    } // ~Inst_VOPC__V_CMPX_GT_U32

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_NE_U32::Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ne_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_NE_U32

    Inst_VOPC__V_CMPX_NE_U32::~Inst_VOPC__V_CMPX_NE_U32()
    {
    } // ~Inst_VOPC__V_CMPX_NE_U32

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_GE_U32::Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_ge_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_GE_U32

    Inst_VOPC__V_CMPX_GE_U32::~Inst_VOPC__V_CMPX_GE_U32()
    {
    } // ~Inst_VOPC__V_CMPX_GE_U32

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMPX_T_U32::Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_t_u32")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMPX_T_U32

    Inst_VOPC__V_CMPX_T_U32::~Inst_VOPC__V_CMPX_T_U32()
    {
    } // ~Inst_VOPC__V_CMPX_T_U32

    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 1);
            }
        }

        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VOPC__V_CMP_F_I64::Inst_VOPC__V_CMP_F_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_i64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_F_I64

    Inst_VOPC__V_CMP_F_I64::~Inst_VOPC__V_CMP_F_I64()
    {
    } // ~Inst_VOPC__V_CMP_F_I64

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_LT_I64::Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_i64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_LT_I64

    Inst_VOPC__V_CMP_LT_I64::~Inst_VOPC__V_CMP_LT_I64()
    {
    } // ~Inst_VOPC__V_CMP_LT_I64

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // Signed 64-bit VOPC compares (v_cmp_lt/eq/le/gt/ne/ge_i64).  Each
    // lane compares a 64-bit source pair and writes one VCC bit; plain
    // v_cmp_* variants do not touch EXEC.
    void
    Inst_VOPC__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI64 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_EQ_I64::Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_i64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_EQ_I64

    Inst_VOPC__V_CMP_EQ_I64::~Inst_VOPC__V_CMP_EQ_I64()
    {
    } // ~Inst_VOPC__V_CMP_EQ_I64

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI64 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_LE_I64::Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_i64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_LE_I64

    Inst_VOPC__V_CMP_LE_I64::~Inst_VOPC__V_CMP_LE_I64()
    {
    } // ~Inst_VOPC__V_CMP_LE_I64

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI64 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_GT_I64::Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_i64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_GT_I64

    Inst_VOPC__V_CMP_GT_I64::~Inst_VOPC__V_CMP_GT_I64()
    {
    } // ~Inst_VOPC__V_CMP_GT_I64

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI64 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_NE_I64::Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ne_i64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_NE_I64

    Inst_VOPC__V_CMP_NE_I64::~Inst_VOPC__V_CMP_NE_I64()
    {
    } // ~Inst_VOPC__V_CMP_NE_I64

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI64 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_GE_I64::Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_ge_i64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_GE_I64

    Inst_VOPC__V_CMP_GE_I64::~Inst_VOPC__V_CMP_GE_I64()
    {
    } // ~Inst_VOPC__V_CMP_GE_I64

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI64 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_T_I64::Inst_VOPC__V_CMP_T_I64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_t_i64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_T_I64

    Inst_VOPC__V_CMP_T_I64::~Inst_VOPC__V_CMP_T_I64()
    {
    } // ~Inst_VOPC__V_CMP_T_I64

    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
    // v_cmp_t_i64 and the first unsigned 64-bit compares
    // (v_cmp_f/lt/eq/le_u64).  One VCC result bit per active lane.
    void
    Inst_VOPC__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // "true" compare: no sources, active lanes get a 1 bit.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 1);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_F_U64::Inst_VOPC__V_CMP_F_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_f_u64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_F_U64

    Inst_VOPC__V_CMP_F_U64::~Inst_VOPC__V_CMP_F_U64()
    {
    } // ~Inst_VOPC__V_CMP_F_U64

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // "false" compare: no sources, active lanes get a 0 bit.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_LT_U64::Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_lt_u64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_LT_U64

    Inst_VOPC__V_CMP_LT_U64::~Inst_VOPC__V_CMP_LT_U64()
    {
    } // ~Inst_VOPC__V_CMP_LT_U64

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU64 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_EQ_U64::Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_eq_u64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_EQ_U64

    Inst_VOPC__V_CMP_EQ_U64::~Inst_VOPC__V_CMP_EQ_U64()
    {
    } // ~Inst_VOPC__V_CMP_EQ_U64

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU64 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_LE_U64::Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_le_u64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_LE_U64

    Inst_VOPC__V_CMP_LE_U64::~Inst_VOPC__V_CMP_LE_U64()
    {
    } // ~Inst_VOPC__V_CMP_LE_U64

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOPC__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU64 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
            }
        }

        vcc.write();
    }

    Inst_VOPC__V_CMP_GT_U64::Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmp_gt_u64")
    {
        setFlag(ALU);
    } // Inst_VOPC__V_CMP_GT_U64

    Inst_VOPC__V_CMP_GT_U64::~Inst_VOPC__V_CMP_GT_U64()
    {
    } // ~Inst_VOPC__V_CMP_GT_U64

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
- void - Inst_VOPC__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_NE_U64::Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U64 - - Inst_VOPC__V_CMP_NE_U64::~Inst_VOPC__V_CMP_NE_U64() - { - } // ~Inst_VOPC__V_CMP_NE_U64 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_GE_U64::Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U64 - - Inst_VOPC__V_CMP_GE_U64::~Inst_VOPC__V_CMP_GE_U64() - { - } // ~Inst_VOPC__V_CMP_GE_U64 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMP_T_U64::Inst_VOPC__V_CMP_T_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U64 - - Inst_VOPC__V_CMP_T_U64::~Inst_VOPC__V_CMP_T_U64() - { - } // ~Inst_VOPC__V_CMP_T_U64 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } - - Inst_VOPC__V_CMPX_F_I64::Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_F_I64 - - Inst_VOPC__V_CMPX_F_I64::~Inst_VOPC__V_CMPX_F_I64() - { - } // ~Inst_VOPC__V_CMPX_F_I64 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_LT_I64::Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_LT_I64 - - Inst_VOPC__V_CMPX_LT_I64::~Inst_VOPC__V_CMPX_LT_I64() - { - } // ~Inst_VOPC__V_CMPX_LT_I64 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_EQ_I64::Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_EQ_I64 - - Inst_VOPC__V_CMPX_EQ_I64::~Inst_VOPC__V_CMPX_EQ_I64() - { - } // ~Inst_VOPC__V_CMPX_EQ_I64 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_LE_I64::Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_LE_I64 - - Inst_VOPC__V_CMPX_LE_I64::~Inst_VOPC__V_CMPX_LE_I64() - { - } // ~Inst_VOPC__V_CMPX_LE_I64 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_GT_I64::Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_GT_I64 - - Inst_VOPC__V_CMPX_GT_I64::~Inst_VOPC__V_CMPX_GT_I64() - { - } // ~Inst_VOPC__V_CMPX_GT_I64 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NE_I64::Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_NE_I64 - - Inst_VOPC__V_CMPX_NE_I64::~Inst_VOPC__V_CMPX_NE_I64() - { - } // ~Inst_VOPC__V_CMPX_NE_I64 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_GE_I64::Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_GE_I64 - - Inst_VOPC__V_CMPX_GE_I64::~Inst_VOPC__V_CMPX_GE_I64() - { - } // ~Inst_VOPC__V_CMPX_GE_I64 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_T_I64::Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_T_I64 - - Inst_VOPC__V_CMPX_T_I64::~Inst_VOPC__V_CMPX_T_I64() - { - } // ~Inst_VOPC__V_CMPX_T_I64 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_F_U64::Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_F_U64 - - Inst_VOPC__V_CMPX_F_U64::~Inst_VOPC__V_CMPX_F_U64() - { - } // ~Inst_VOPC__V_CMPX_F_U64 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_LT_U64::Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_LT_U64 - - Inst_VOPC__V_CMPX_LT_U64::~Inst_VOPC__V_CMPX_LT_U64() - { - } // ~Inst_VOPC__V_CMPX_LT_U64 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_EQ_U64::Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_EQ_U64 - - Inst_VOPC__V_CMPX_EQ_U64::~Inst_VOPC__V_CMPX_EQ_U64() - { - } // ~Inst_VOPC__V_CMPX_EQ_U64 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_LE_U64::Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_LE_U64 - - Inst_VOPC__V_CMPX_LE_U64::~Inst_VOPC__V_CMPX_LE_U64() - { - } // ~Inst_VOPC__V_CMPX_LE_U64 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_GT_U64::Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_GT_U64 - - Inst_VOPC__V_CMPX_GT_U64::~Inst_VOPC__V_CMPX_GT_U64() - { - } // ~Inst_VOPC__V_CMPX_GT_U64 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_NE_U64::Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_NE_U64 - - Inst_VOPC__V_CMPX_NE_U64::~Inst_VOPC__V_CMPX_NE_U64() - { - } // ~Inst_VOPC__V_CMPX_NE_U64 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_GE_U64::Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_GE_U64 - - Inst_VOPC__V_CMPX_GE_U64::~Inst_VOPC__V_CMPX_GE_U64() - { - } // ~Inst_VOPC__V_CMPX_GE_U64 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } - - Inst_VOPC__V_CMPX_T_U64::Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMPX_T_U64 - - Inst_VOPC__V_CMPX_T_U64::~Inst_VOPC__V_CMPX_T_U64() - { - } // ~Inst_VOPC__V_CMPX_T_U64 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
    void
    Inst_VOPC__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        // Always-true compare: set the VCC bit for every active lane.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, 1);
            }
        }

        // CMPX variant: the compare result also replaces EXEC.
        wf->execMask() = vcc.rawData();
        vcc.write();
    }

    Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32(
          InFmt_VINTRP *iFmt)
        : Inst_VINTRP(iFmt, "v_interp_p1_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VINTRP__V_INTERP_P1_F32

    Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32()
    {
    } // ~Inst_VINTRP__V_INTERP_P1_F32

    // D.f = P10 * S.f + P0; parameter interpolation
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32(
          InFmt_VINTRP *iFmt)
        : Inst_VINTRP(iFmt, "v_interp_p2_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VINTRP__V_INTERP_P2_F32

    Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32()
    {
    } // ~Inst_VINTRP__V_INTERP_P2_F32

    // D.f = P20 * S.f + D.f; parameter interpolation
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32(
          InFmt_VINTRP *iFmt)
        : Inst_VINTRP(iFmt, "v_interp_mov_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VINTRP__V_INTERP_MOV_F32

    Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32()
    {
    } // ~Inst_VINTRP__V_INTERP_MOV_F32

    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMP_CLASS_F32::Inst_VOP3__V_CMP_CLASS_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_class_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMP_CLASS_F32

    Inst_VOP3__V_CMP_CLASS_F32::~Inst_VOP3__V_CMP_CLASS_F32()
    {
    } // ~Inst_VOP3__V_CMP_CLASS_F32

    // VCC = IEEE numeric class function specified in S1.u, performed on S0.f
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    void
    Inst_VOP3__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // For each active lane, test the classes requested by src1's bit
        // mask in ascending bit order; the first match sets the lane's
        // result bit and moves on to the next lane.
        // NOTE(review): lanes that match no class (and inactive lanes) are
        // never explicitly cleared -- this appears to rely on the
        // destination operand starting out zeroed; confirm.
        // Note: std::isnan cannot distinguish signaling from quiet NaN, so
        // bits 0 and 1 are handled together.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
                    // is NaN
                    if (std::isnan(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 2)) {
                    // is -infinity
                    if (std::isinf(src0[lane]) && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 3)) {
                    // is -normal
                    if (std::isnormal(src0[lane])
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 4)) {
                    // is -denormal
                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 5)) {
                    // is -zero
                    if (std::fpclassify(src0[lane]) == FP_ZERO
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 6)) {
                    // is +zero
                    if (std::fpclassify(src0[lane]) == FP_ZERO
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 7)) {
                    // is +denormal
                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 8)) {
                    // is +normal
                    if (std::isnormal(src0[lane])
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 9)) {
                    // is +infinity
                    if (std::isinf(src0[lane])
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
            }
        }

        sdst.write();
    }

    Inst_VOP3__V_CMPX_CLASS_F32::Inst_VOP3__V_CMPX_CLASS_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_class_f32", true)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CMPX_CLASS_F32

    Inst_VOP3__V_CMPX_CLASS_F32::~Inst_VOP3__V_CMPX_CLASS_F32()
    {
    } // ~Inst_VOP3__V_CMPX_CLASS_F32

    // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
    // S0.f
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    void
    Inst_VOP3__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // For each active lane, test the classes requested by src1's bit
        // mask in ascending bit order; the first match sets the lane's
        // result bit. Non-matching/inactive lanes are never explicitly
        // cleared -- NOTE(review): appears to rely on the destination
        // operand starting out zeroed; confirm.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
                    // is NaN (signaling or quiet; std::isnan cannot
                    // distinguish the two, so bits 0 and 1 share a test)
                    if (std::isnan(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 2)) {
                    // is -infinity
                    if (std::isinf(src0[lane]) && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 3)) {
                    // is -normal
                    if (std::isnormal(src0[lane])
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 4)) {
                    // is -denormal
                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 5)) {
                    // is -zero
                    if (std::fpclassify(src0[lane]) == FP_ZERO
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 6)) {
                    // is +zero
                    if (std::fpclassify(src0[lane]) == FP_ZERO
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 7)) {
                    // is +denormal
                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 8)) {
                    // is +normal
                    if (std::isnormal(src0[lane])
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 9)) {
                    // is +infinity
                    if (std::isinf(src0[lane])
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
            }
        }

        // CMPX variant: the classify result also replaces EXEC.
        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    Inst_VOP3__V_CMP_CLASS_F64::Inst_VOP3__V_CMP_CLASS_F64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_class_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMP_CLASS_F64

    Inst_VOP3__V_CMP_CLASS_F64::~Inst_VOP3__V_CMP_CLASS_F64()
    {
    } // ~Inst_VOP3__V_CMP_CLASS_F64

    // VCC = IEEE numeric class function specified in S1.u, performed on S0.d
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    void
    Inst_VOP3__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Same classify chain as the F32 variant, operating on doubles.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
                    // is NaN
                    if (std::isnan(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 2)) {
                    // is -infinity
                    if (std::isinf(src0[lane]) && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 3)) {
                    // is -normal
                    if (std::isnormal(src0[lane])
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 4)) {
                    // is -denormal
                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 5)) {
                    // is -zero
                    if (std::fpclassify(src0[lane]) == FP_ZERO
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 6)) {
                    // is +zero
                    if (std::fpclassify(src0[lane]) == FP_ZERO
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 7)) {
                    // is +denormal
                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 8)) {
                    // is +normal
                    if (std::isnormal(src0[lane])
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 9)) {
                    // is +infinity
                    if (std::isinf(src0[lane])
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
            }
        }

        sdst.write();
    }

    Inst_VOP3__V_CMPX_CLASS_F64::Inst_VOP3__V_CMPX_CLASS_F64(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_class_f64", true)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CMPX_CLASS_F64

    Inst_VOP3__V_CMPX_CLASS_F64::~Inst_VOP3__V_CMPX_CLASS_F64()
    {
    } // ~Inst_VOP3__V_CMPX_CLASS_F64

    // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
    // S0.d
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    void
    Inst_VOP3__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Classify each active lane against the class mask in src1 (bits
        // tested in ascending order; first match wins). Non-matching and
        // inactive lanes are never explicitly cleared -- NOTE(review):
        // appears to rely on the destination operand starting zeroed;
        // confirm.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
                    // is NaN (signaling or quiet; std::isnan cannot tell
                    // the two apart, so bits 0 and 1 share a test)
                    if (std::isnan(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 2)) {
                    // is -infinity
                    if (std::isinf(src0[lane]) && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 3)) {
                    // is -normal
                    if (std::isnormal(src0[lane])
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 4)) {
                    // is -denormal
                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 5)) {
                    // is -zero
                    if (std::fpclassify(src0[lane]) == FP_ZERO
                        && std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 6)) {
                    // is +zero
                    if (std::fpclassify(src0[lane]) == FP_ZERO
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 7)) {
                    // is +denormal
                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 8)) {
                    // is +normal
                    if (std::isnormal(src0[lane])
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
                if (bits(src1[lane], 9)) {
                    // is +infinity
                    if (std::isinf(src0[lane])
                        && !std::signbit(src0[lane])) {
                        sdst.setBit(lane, 1);
                        continue;
                    }
                }
            }
        }

        // CMPX variant: the classify result also replaces EXEC.
        wf->execMask() = sdst.rawData();
        sdst.write();
    }

    Inst_VOP3__V_CMP_CLASS_F16::Inst_VOP3__V_CMP_CLASS_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_class_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_CLASS_F16

    Inst_VOP3__V_CMP_CLASS_F16::~Inst_VOP3__V_CMP_CLASS_F16()
    {
    } // ~Inst_VOP3__V_CMP_CLASS_F16

    // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VOP3__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_CLASS_F16::Inst_VOP3__V_CMPX_CLASS_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_class_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_CLASS_F16

    Inst_VOP3__V_CMPX_CLASS_F16::~Inst_VOP3__V_CMPX_CLASS_F16()
    {
    } // ~Inst_VOP3__V_CMPX_CLASS_F16

    // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
    // S0.f16
    // The function reports true if the floating point value is any of the
    // numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VOP3__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMP_F_F16::Inst_VOP3__V_CMP_F_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_f_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_F_F16

    Inst_VOP3__V_CMP_F_F16::~Inst_VOP3__V_CMP_F_F16()
    {
    } // ~Inst_VOP3__V_CMP_F_F16

    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VOP3__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMP_LT_F16::Inst_VOP3__V_CMP_LT_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_LT_F16

    Inst_VOP3__V_CMP_LT_F16::~Inst_VOP3__V_CMP_LT_F16()
    {
    } // ~Inst_VOP3__V_CMP_LT_F16

    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VOP3__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMP_EQ_F16::Inst_VOP3__V_CMP_EQ_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_eq_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_EQ_F16

    Inst_VOP3__V_CMP_EQ_F16::~Inst_VOP3__V_CMP_EQ_F16()
    {
    } // ~Inst_VOP3__V_CMP_EQ_F16

    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VOP3__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMP_LE_F16::Inst_VOP3__V_CMP_LE_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_le_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_LE_F16

    Inst_VOP3__V_CMP_LE_F16::~Inst_VOP3__V_CMP_LE_F16()
    {
    } // ~Inst_VOP3__V_CMP_LE_F16

    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VOP3__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMP_GT_F16::Inst_VOP3__V_CMP_GT_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_gt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_GT_F16

    Inst_VOP3__V_CMP_GT_F16::~Inst_VOP3__V_CMP_GT_F16()
    {
    } // ~Inst_VOP3__V_CMP_GT_F16

    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VOP3__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMP_LG_F16::Inst_VOP3__V_CMP_LG_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_lg_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_LG_F16

    Inst_VOP3__V_CMP_LG_F16::~Inst_VOP3__V_CMP_LG_F16()
    {
    } // ~Inst_VOP3__V_CMP_LG_F16

    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VOP3__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMP_GE_F16::Inst_VOP3__V_CMP_GE_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_ge_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_GE_F16

    Inst_VOP3__V_CMP_GE_F16::~Inst_VOP3__V_CMP_GE_F16()
    {
    } // ~Inst_VOP3__V_CMP_GE_F16

    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    // Not implemented in the model: executing it panics the simulation.
    void
    Inst_VOP3__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMP_O_F16::Inst_VOP3__V_CMP_O_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmp_o_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_O_F16

    Inst_VOP3__V_CMP_O_F16::~Inst_VOP3__V_CMP_O_F16()
    {
    } // ~Inst_VOP3__V_CMP_O_F16

    // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding.
- void - Inst_VOP3__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMP_U_F16::Inst_VOP3__V_CMP_U_F16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_u_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_U_F16 - - Inst_VOP3__V_CMP_U_F16::~Inst_VOP3__V_CMP_U_F16() - { - } // ~Inst_VOP3__V_CMP_U_F16 - - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMP_NGE_F16::Inst_VOP3__V_CMP_NGE_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nge_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NGE_F16 - - Inst_VOP3__V_CMP_NGE_F16::~Inst_VOP3__V_CMP_NGE_F16() - { - } // ~Inst_VOP3__V_CMP_NGE_F16 - - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMP_NLG_F16::Inst_VOP3__V_CMP_NLG_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nlg_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLG_F16 - - Inst_VOP3__V_CMP_NLG_F16::~Inst_VOP3__V_CMP_NLG_F16() - { - } // ~Inst_VOP3__V_CMP_NLG_F16 - - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMP_NGT_F16::Inst_VOP3__V_CMP_NGT_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ngt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NGT_F16 - - Inst_VOP3__V_CMP_NGT_F16::~Inst_VOP3__V_CMP_NGT_F16() - { - } // ~Inst_VOP3__V_CMP_NGT_F16 - - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMP_NLE_F16::Inst_VOP3__V_CMP_NLE_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nle_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLE_F16 - - Inst_VOP3__V_CMP_NLE_F16::~Inst_VOP3__V_CMP_NLE_F16() - { - } // ~Inst_VOP3__V_CMP_NLE_F16 - - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMP_NEQ_F16::Inst_VOP3__V_CMP_NEQ_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_neq_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NEQ_F16 - - Inst_VOP3__V_CMP_NEQ_F16::~Inst_VOP3__V_CMP_NEQ_F16() - { - } // ~Inst_VOP3__V_CMP_NEQ_F16 - - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMP_NLT_F16::Inst_VOP3__V_CMP_NLT_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nlt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLT_F16 - - Inst_VOP3__V_CMP_NLT_F16::~Inst_VOP3__V_CMP_NLT_F16() - { - } // ~Inst_VOP3__V_CMP_NLT_F16 - - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMP_TRU_F16::Inst_VOP3__V_CMP_TRU_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_tru_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_TRU_F16 - - Inst_VOP3__V_CMP_TRU_F16::~Inst_VOP3__V_CMP_TRU_F16() - { - } // ~Inst_VOP3__V_CMP_TRU_F16 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMPX_F_F16::Inst_VOP3__V_CMPX_F_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_f_f16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_F_F16 - - Inst_VOP3__V_CMPX_F_F16::~Inst_VOP3__V_CMPX_F_F16() - { - } // ~Inst_VOP3__V_CMPX_F_F16 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LT_F16::Inst_VOP3__V_CMPX_LT_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_lt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMPX_LT_F16 - - Inst_VOP3__V_CMPX_LT_F16::~Inst_VOP3__V_CMPX_LT_F16() - { - } // ~Inst_VOP3__V_CMPX_LT_F16 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMPX_EQ_F16::Inst_VOP3__V_CMPX_EQ_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_eq_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMPX_EQ_F16 - - Inst_VOP3__V_CMPX_EQ_F16::~Inst_VOP3__V_CMPX_EQ_F16() - { - } // ~Inst_VOP3__V_CMPX_EQ_F16 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
    // ------------------------------------------------------------------
    // VOP3-encoded 16-bit CMPX compares (v_cmpx_*_f16).  In addition to
    // writing the destination, CMPX ops also write the EXEC mask; none of
    // the ones below are implemented in this model — executing one raises
    // a fatal panic via panicUnimplemented().
    // ------------------------------------------------------------------
    void
    Inst_VOP3__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_LE_F16::Inst_VOP3__V_CMPX_LE_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_le_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_LE_F16

    Inst_VOP3__V_CMPX_LE_F16::~Inst_VOP3__V_CMPX_LE_F16()
    {
    } // ~Inst_VOP3__V_CMPX_LE_F16

    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_GT_F16::Inst_VOP3__V_CMPX_GT_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_gt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_GT_F16

    Inst_VOP3__V_CMPX_GT_F16::~Inst_VOP3__V_CMPX_GT_F16()
    {
    } // ~Inst_VOP3__V_CMPX_GT_F16

    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_LG_F16::Inst_VOP3__V_CMPX_LG_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_lg_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_LG_F16

    Inst_VOP3__V_CMPX_LG_F16::~Inst_VOP3__V_CMPX_LG_F16()
    {
    } // ~Inst_VOP3__V_CMPX_LG_F16

    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_GE_F16::Inst_VOP3__V_CMPX_GE_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ge_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_GE_F16

    Inst_VOP3__V_CMPX_GE_F16::~Inst_VOP3__V_CMPX_GE_F16()
    {
    } // ~Inst_VOP3__V_CMPX_GE_F16

    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_O_F16::Inst_VOP3__V_CMPX_O_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_o_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_O_F16

    Inst_VOP3__V_CMPX_O_F16::~Inst_VOP3__V_CMPX_O_F16()
    {
    } // ~Inst_VOP3__V_CMPX_O_F16

    // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC
    // encoding.
    void
    Inst_VOP3__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_U_F16::Inst_VOP3__V_CMPX_U_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_u_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_U_F16

    Inst_VOP3__V_CMPX_U_F16::~Inst_VOP3__V_CMPX_U_F16()
    {
    } // ~Inst_VOP3__V_CMPX_U_F16

    // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC
    // encoding.
    void
    Inst_VOP3__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_NGE_F16::Inst_VOP3__V_CMPX_NGE_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nge_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NGE_F16

    Inst_VOP3__V_CMPX_NGE_F16::~Inst_VOP3__V_CMPX_NGE_F16()
    {
    } // ~Inst_VOP3__V_CMPX_NGE_F16

    // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_NLG_F16::Inst_VOP3__V_CMPX_NLG_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nlg_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NLG_F16

    Inst_VOP3__V_CMPX_NLG_F16::~Inst_VOP3__V_CMPX_NLG_F16()
    {
    } // ~Inst_VOP3__V_CMPX_NLG_F16

    // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_NGT_F16::Inst_VOP3__V_CMPX_NGT_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_ngt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NGT_F16

    Inst_VOP3__V_CMPX_NGT_F16::~Inst_VOP3__V_CMPX_NGT_F16()
    {
    } // ~Inst_VOP3__V_CMPX_NGT_F16

    // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_NLE_F16::Inst_VOP3__V_CMPX_NLE_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nle_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NLE_F16

    Inst_VOP3__V_CMPX_NLE_F16::~Inst_VOP3__V_CMPX_NLE_F16()
    {
    } // ~Inst_VOP3__V_CMPX_NLE_F16

    // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_NEQ_F16::Inst_VOP3__V_CMPX_NEQ_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_neq_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NEQ_F16

    Inst_VOP3__V_CMPX_NEQ_F16::~Inst_VOP3__V_CMPX_NEQ_F16()
    {
    } // ~Inst_VOP3__V_CMPX_NEQ_F16

    // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    void
    Inst_VOP3__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CMPX_NLT_F16::Inst_VOP3__V_CMPX_NLT_F16(
        InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cmpx_nlt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMPX_NLT_F16

    Inst_VOP3__V_CMPX_NLT_F16::~Inst_VOP3__V_CMPX_NLT_F16()
    {
    } // ~Inst_VOP3__V_CMPX_NLT_F16

    // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
- void - Inst_VOP3__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_CMPX_TRU_F16::Inst_VOP3__V_CMPX_TRU_F16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_tru_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMPX_TRU_F16 - - Inst_VOP3__V_CMPX_TRU_F16::~Inst_VOP3__V_CMPX_TRU_F16() - { - } // ~Inst_VOP3__V_CMPX_TRU_F16 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMP_F_F32::Inst_VOP3__V_CMP_F_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_f_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_F_F32 - - Inst_VOP3__V_CMP_F_F32::~Inst_VOP3__V_CMP_F_F32() - { - } // ~Inst_VOP3__V_CMP_F_F32 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LT_F32::Inst_VOP3__V_CMP_LT_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_lt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LT_F32 - - Inst_VOP3__V_CMP_LT_F32::~Inst_VOP3__V_CMP_LT_F32() - { - } // ~Inst_VOP3__V_CMP_LT_F32 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_EQ_F32::Inst_VOP3__V_CMP_EQ_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_eq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_EQ_F32 - - Inst_VOP3__V_CMP_EQ_F32::~Inst_VOP3__V_CMP_EQ_F32() - { - } // ~Inst_VOP3__V_CMP_EQ_F32 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LE_F32::Inst_VOP3__V_CMP_LE_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_le_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LE_F32 - - Inst_VOP3__V_CMP_LE_F32::~Inst_VOP3__V_CMP_LE_F32() - { - } // ~Inst_VOP3__V_CMP_LE_F32 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GT_F32::Inst_VOP3__V_CMP_GT_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_gt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_GT_F32 - - Inst_VOP3__V_CMP_GT_F32::~Inst_VOP3__V_CMP_GT_F32() - { - } // ~Inst_VOP3__V_CMP_GT_F32 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LG_F32::Inst_VOP3__V_CMP_LG_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_lg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LG_F32 - - Inst_VOP3__V_CMP_LG_F32::~Inst_VOP3__V_CMP_LG_F32() - { - } // ~Inst_VOP3__V_CMP_LG_F32 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GE_F32::Inst_VOP3__V_CMP_GE_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_GE_F32 - - Inst_VOP3__V_CMP_GE_F32::~Inst_VOP3__V_CMP_GE_F32() - { - } // ~Inst_VOP3__V_CMP_GE_F32 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_O_F32::Inst_VOP3__V_CMP_O_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_o_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_O_F32 - - Inst_VOP3__V_CMP_O_F32::~Inst_VOP3__V_CMP_O_F32() - { - } // ~Inst_VOP3__V_CMP_O_F32 - - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_U_F32::Inst_VOP3__V_CMP_U_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_u_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_U_F32 - - Inst_VOP3__V_CMP_U_F32::~Inst_VOP3__V_CMP_U_F32() - { - } // ~Inst_VOP3__V_CMP_U_F32 - - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NGE_F32::Inst_VOP3__V_CMP_NGE_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NGE_F32 - - Inst_VOP3__V_CMP_NGE_F32::~Inst_VOP3__V_CMP_NGE_F32() - { - } // ~Inst_VOP3__V_CMP_NGE_F32 - - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NLG_F32::Inst_VOP3__V_CMP_NLG_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nlg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLG_F32 - - Inst_VOP3__V_CMP_NLG_F32::~Inst_VOP3__V_CMP_NLG_F32() - { - } // ~Inst_VOP3__V_CMP_NLG_F32 - - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NGT_F32::Inst_VOP3__V_CMP_NGT_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ngt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NGT_F32 - - Inst_VOP3__V_CMP_NGT_F32::~Inst_VOP3__V_CMP_NGT_F32() - { - } // ~Inst_VOP3__V_CMP_NGT_F32 - - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NLE_F32::Inst_VOP3__V_CMP_NLE_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nle_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLE_F32 - - Inst_VOP3__V_CMP_NLE_F32::~Inst_VOP3__V_CMP_NLE_F32() - { - } // ~Inst_VOP3__V_CMP_NLE_F32 - - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NEQ_F32::Inst_VOP3__V_CMP_NEQ_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_neq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NEQ_F32 - - Inst_VOP3__V_CMP_NEQ_F32::~Inst_VOP3__V_CMP_NEQ_F32() - { - } // ~Inst_VOP3__V_CMP_NEQ_F32 - - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NLT_F32::Inst_VOP3__V_CMP_NLT_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nlt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLT_F32 - - Inst_VOP3__V_CMP_NLT_F32::~Inst_VOP3__V_CMP_NLT_F32() - { - } // ~Inst_VOP3__V_CMP_NLT_F32 - - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_TRU_F32::Inst_VOP3__V_CMP_TRU_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_tru_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_TRU_F32 - - Inst_VOP3__V_CMP_TRU_F32::~Inst_VOP3__V_CMP_TRU_F32() - { - } // ~Inst_VOP3__V_CMP_TRU_F32 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMPX_F_F32::Inst_VOP3__V_CMPX_F_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_f_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_F_F32 - - Inst_VOP3__V_CMPX_F_F32::~Inst_VOP3__V_CMPX_F_F32() - { - } // ~Inst_VOP3__V_CMPX_F_F32 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LT_F32::Inst_VOP3__V_CMPX_LT_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_lt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_LT_F32 - - Inst_VOP3__V_CMPX_LT_F32::~Inst_VOP3__V_CMPX_LT_F32() - { - } // ~Inst_VOP3__V_CMPX_LT_F32 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_EQ_F32::Inst_VOP3__V_CMPX_EQ_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_eq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_EQ_F32 - - Inst_VOP3__V_CMPX_EQ_F32::~Inst_VOP3__V_CMPX_EQ_F32() - { - } // ~Inst_VOP3__V_CMPX_EQ_F32 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LE_F32::Inst_VOP3__V_CMPX_LE_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_le_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_LE_F32 - - Inst_VOP3__V_CMPX_LE_F32::~Inst_VOP3__V_CMPX_LE_F32() - { - } // ~Inst_VOP3__V_CMPX_LE_F32 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GT_F32::Inst_VOP3__V_CMPX_GT_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_gt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_GT_F32 - - Inst_VOP3__V_CMPX_GT_F32::~Inst_VOP3__V_CMPX_GT_F32() - { - } // ~Inst_VOP3__V_CMPX_GT_F32 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LG_F32::Inst_VOP3__V_CMPX_LG_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_lg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_LG_F32 - - Inst_VOP3__V_CMPX_LG_F32::~Inst_VOP3__V_CMPX_LG_F32() - { - } // ~Inst_VOP3__V_CMPX_LG_F32 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GE_F32::Inst_VOP3__V_CMPX_GE_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_GE_F32 - - Inst_VOP3__V_CMPX_GE_F32::~Inst_VOP3__V_CMPX_GE_F32() - { - } // ~Inst_VOP3__V_CMPX_GE_F32 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_O_F32::Inst_VOP3__V_CMPX_O_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_o_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_O_F32 - - Inst_VOP3__V_CMPX_O_F32::~Inst_VOP3__V_CMPX_O_F32() - { - } // ~Inst_VOP3__V_CMPX_O_F32 - - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_U_F32::Inst_VOP3__V_CMPX_U_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_u_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_U_F32 - - Inst_VOP3__V_CMPX_U_F32::~Inst_VOP3__V_CMPX_U_F32() - { - } // ~Inst_VOP3__V_CMPX_U_F32 - - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NGE_F32::Inst_VOP3__V_CMPX_NGE_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_nge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_NGE_F32 - - Inst_VOP3__V_CMPX_NGE_F32::~Inst_VOP3__V_CMPX_NGE_F32() - { - } // ~Inst_VOP3__V_CMPX_NGE_F32 - - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NLG_F32::Inst_VOP3__V_CMPX_NLG_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_nlg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_NLG_F32 - - Inst_VOP3__V_CMPX_NLG_F32::~Inst_VOP3__V_CMPX_NLG_F32() - { - } // ~Inst_VOP3__V_CMPX_NLG_F32 - - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NGT_F32::Inst_VOP3__V_CMPX_NGT_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ngt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_NGT_F32 - - Inst_VOP3__V_CMPX_NGT_F32::~Inst_VOP3__V_CMPX_NGT_F32() - { - } // ~Inst_VOP3__V_CMPX_NGT_F32 - - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NLE_F32::Inst_VOP3__V_CMPX_NLE_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_nle_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_NLE_F32 - - Inst_VOP3__V_CMPX_NLE_F32::~Inst_VOP3__V_CMPX_NLE_F32() - { - } // ~Inst_VOP3__V_CMPX_NLE_F32 - - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NEQ_F32::Inst_VOP3__V_CMPX_NEQ_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_neq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_NEQ_F32 - - Inst_VOP3__V_CMPX_NEQ_F32::~Inst_VOP3__V_CMPX_NEQ_F32() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F32 - - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NLT_F32::Inst_VOP3__V_CMPX_NLT_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_nlt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_NLT_F32 - - Inst_VOP3__V_CMPX_NLT_F32::~Inst_VOP3__V_CMPX_NLT_F32() - { - } // ~Inst_VOP3__V_CMPX_NLT_F32 - - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_TRU_F32::Inst_VOP3__V_CMPX_TRU_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_tru_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMPX_TRU_F32 - - Inst_VOP3__V_CMPX_TRU_F32::~Inst_VOP3__V_CMPX_TRU_F32() - { - } // ~Inst_VOP3__V_CMPX_TRU_F32 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMP_F_F64::Inst_VOP3__V_CMP_F_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_f_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_F_F64 - - Inst_VOP3__V_CMP_F_F64::~Inst_VOP3__V_CMP_F_F64() - { - } // ~Inst_VOP3__V_CMP_F_F64 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LT_F64::Inst_VOP3__V_CMP_LT_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_lt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LT_F64 - - Inst_VOP3__V_CMP_LT_F64::~Inst_VOP3__V_CMP_LT_F64() - { - } // ~Inst_VOP3__V_CMP_LT_F64 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_EQ_F64::Inst_VOP3__V_CMP_EQ_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_eq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_EQ_F64 - - Inst_VOP3__V_CMP_EQ_F64::~Inst_VOP3__V_CMP_EQ_F64() - { - } // ~Inst_VOP3__V_CMP_EQ_F64 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LE_F64::Inst_VOP3__V_CMP_LE_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_le_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LE_F64 - - Inst_VOP3__V_CMP_LE_F64::~Inst_VOP3__V_CMP_LE_F64() - { - } // ~Inst_VOP3__V_CMP_LE_F64 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GT_F64::Inst_VOP3__V_CMP_GT_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_gt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_GT_F64 - - Inst_VOP3__V_CMP_GT_F64::~Inst_VOP3__V_CMP_GT_F64() - { - } // ~Inst_VOP3__V_CMP_GT_F64 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LG_F64::Inst_VOP3__V_CMP_LG_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_lg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LG_F64 - - Inst_VOP3__V_CMP_LG_F64::~Inst_VOP3__V_CMP_LG_F64() - { - } // ~Inst_VOP3__V_CMP_LG_F64 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GE_F64::Inst_VOP3__V_CMP_GE_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_GE_F64 - - Inst_VOP3__V_CMP_GE_F64::~Inst_VOP3__V_CMP_GE_F64() - { - } // ~Inst_VOP3__V_CMP_GE_F64 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_O_F64::Inst_VOP3__V_CMP_O_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_o_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_O_F64 - - Inst_VOP3__V_CMP_O_F64::~Inst_VOP3__V_CMP_O_F64() - { - } // ~Inst_VOP3__V_CMP_O_F64 - - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_U_F64::Inst_VOP3__V_CMP_U_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_u_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_U_F64 - - Inst_VOP3__V_CMP_U_F64::~Inst_VOP3__V_CMP_U_F64() - { - } // ~Inst_VOP3__V_CMP_U_F64 - - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NGE_F64::Inst_VOP3__V_CMP_NGE_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NGE_F64 - - Inst_VOP3__V_CMP_NGE_F64::~Inst_VOP3__V_CMP_NGE_F64() - { - } // ~Inst_VOP3__V_CMP_NGE_F64 - - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NLG_F64::Inst_VOP3__V_CMP_NLG_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nlg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLG_F64 - - Inst_VOP3__V_CMP_NLG_F64::~Inst_VOP3__V_CMP_NLG_F64() - { - } // ~Inst_VOP3__V_CMP_NLG_F64 - - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NGT_F64::Inst_VOP3__V_CMP_NGT_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ngt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NGT_F64 - - Inst_VOP3__V_CMP_NGT_F64::~Inst_VOP3__V_CMP_NGT_F64() - { - } // ~Inst_VOP3__V_CMP_NGT_F64 - - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NLE_F64::Inst_VOP3__V_CMP_NLE_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nle_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLE_F64 - - Inst_VOP3__V_CMP_NLE_F64::~Inst_VOP3__V_CMP_NLE_F64() - { - } // ~Inst_VOP3__V_CMP_NLE_F64 - - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NEQ_F64::Inst_VOP3__V_CMP_NEQ_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_neq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NEQ_F64 - - Inst_VOP3__V_CMP_NEQ_F64::~Inst_VOP3__V_CMP_NEQ_F64() - { - } // ~Inst_VOP3__V_CMP_NEQ_F64 - - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NLT_F64::Inst_VOP3__V_CMP_NLT_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_nlt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLT_F64 - - Inst_VOP3__V_CMP_NLT_F64::~Inst_VOP3__V_CMP_NLT_F64() - { - } // ~Inst_VOP3__V_CMP_NLT_F64 - - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_TRU_F64::Inst_VOP3__V_CMP_TRU_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_tru_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_TRU_F64 - - Inst_VOP3__V_CMP_TRU_F64::~Inst_VOP3__V_CMP_TRU_F64() - { - } // ~Inst_VOP3__V_CMP_TRU_F64 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMPX_F_F64::Inst_VOP3__V_CMPX_F_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_f_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_F_F64 - - Inst_VOP3__V_CMPX_F_F64::~Inst_VOP3__V_CMPX_F_F64() - { - } // ~Inst_VOP3__V_CMPX_F_F64 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LT_F64::Inst_VOP3__V_CMPX_LT_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_lt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_LT_F64 - - Inst_VOP3__V_CMPX_LT_F64::~Inst_VOP3__V_CMPX_LT_F64() - { - } // ~Inst_VOP3__V_CMPX_LT_F64 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_EQ_F64::Inst_VOP3__V_CMPX_EQ_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_eq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_EQ_F64 - - Inst_VOP3__V_CMPX_EQ_F64::~Inst_VOP3__V_CMPX_EQ_F64() - { - } // ~Inst_VOP3__V_CMPX_EQ_F64 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LE_F64::Inst_VOP3__V_CMPX_LE_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_le_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_LE_F64 - - Inst_VOP3__V_CMPX_LE_F64::~Inst_VOP3__V_CMPX_LE_F64() - { - } // ~Inst_VOP3__V_CMPX_LE_F64 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GT_F64::Inst_VOP3__V_CMPX_GT_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_gt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_GT_F64 - - Inst_VOP3__V_CMPX_GT_F64::~Inst_VOP3__V_CMPX_GT_F64() - { - } // ~Inst_VOP3__V_CMPX_GT_F64 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LG_F64::Inst_VOP3__V_CMPX_LG_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_lg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_LG_F64 - - Inst_VOP3__V_CMPX_LG_F64::~Inst_VOP3__V_CMPX_LG_F64() - { - } // ~Inst_VOP3__V_CMPX_LG_F64 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GE_F64::Inst_VOP3__V_CMPX_GE_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_GE_F64 - - Inst_VOP3__V_CMPX_GE_F64::~Inst_VOP3__V_CMPX_GE_F64() - { - } // ~Inst_VOP3__V_CMPX_GE_F64 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_O_F64::Inst_VOP3__V_CMPX_O_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_o_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_O_F64 - - Inst_VOP3__V_CMPX_O_F64::~Inst_VOP3__V_CMPX_O_F64() - { - } // ~Inst_VOP3__V_CMPX_O_F64 - - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_U_F64::Inst_VOP3__V_CMPX_U_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_u_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_U_F64 - - Inst_VOP3__V_CMPX_U_F64::~Inst_VOP3__V_CMPX_U_F64() - { - } // ~Inst_VOP3__V_CMPX_U_F64 - - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NGE_F64::Inst_VOP3__V_CMPX_NGE_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_nge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_NGE_F64 - - Inst_VOP3__V_CMPX_NGE_F64::~Inst_VOP3__V_CMPX_NGE_F64() - { - } // ~Inst_VOP3__V_CMPX_NGE_F64 - - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NLG_F64::Inst_VOP3__V_CMPX_NLG_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_nlg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_NLG_F64 - - Inst_VOP3__V_CMPX_NLG_F64::~Inst_VOP3__V_CMPX_NLG_F64() - { - } // ~Inst_VOP3__V_CMPX_NLG_F64 - - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NGT_F64::Inst_VOP3__V_CMPX_NGT_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ngt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_NGT_F64 - - Inst_VOP3__V_CMPX_NGT_F64::~Inst_VOP3__V_CMPX_NGT_F64() - { - } // ~Inst_VOP3__V_CMPX_NGT_F64 - - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NLE_F64::Inst_VOP3__V_CMPX_NLE_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_nle_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_NLE_F64 - - Inst_VOP3__V_CMPX_NLE_F64::~Inst_VOP3__V_CMPX_NLE_F64() - { - } // ~Inst_VOP3__V_CMPX_NLE_F64 - - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NEQ_F64::Inst_VOP3__V_CMPX_NEQ_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_neq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_NEQ_F64 - - Inst_VOP3__V_CMPX_NEQ_F64::~Inst_VOP3__V_CMPX_NEQ_F64() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F64 - - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NLT_F64::Inst_VOP3__V_CMPX_NLT_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_nlt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_NLT_F64 - - Inst_VOP3__V_CMPX_NLT_F64::~Inst_VOP3__V_CMPX_NLT_F64() - { - } // ~Inst_VOP3__V_CMPX_NLT_F64 - - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_TRU_F64::Inst_VOP3__V_CMPX_TRU_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_tru_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMPX_TRU_F64 - - Inst_VOP3__V_CMPX_TRU_F64::~Inst_VOP3__V_CMPX_TRU_F64() - { - } // ~Inst_VOP3__V_CMPX_TRU_F64 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMP_F_I16::Inst_VOP3__V_CMP_F_I16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_f_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I16 - - Inst_VOP3__V_CMP_F_I16::~Inst_VOP3__V_CMP_F_I16() - { - } // ~Inst_VOP3__V_CMP_F_I16 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LT_I16::Inst_VOP3__V_CMP_LT_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_lt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I16 - - Inst_VOP3__V_CMP_LT_I16::~Inst_VOP3__V_CMP_LT_I16() - { - } // ~Inst_VOP3__V_CMP_LT_I16 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_EQ_I16::Inst_VOP3__V_CMP_EQ_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_eq_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I16 - - Inst_VOP3__V_CMP_EQ_I16::~Inst_VOP3__V_CMP_EQ_I16() - { - } // ~Inst_VOP3__V_CMP_EQ_I16 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LE_I16::Inst_VOP3__V_CMP_LE_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_le_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I16 - - Inst_VOP3__V_CMP_LE_I16::~Inst_VOP3__V_CMP_LE_I16() - { - } // ~Inst_VOP3__V_CMP_LE_I16 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GT_I16::Inst_VOP3__V_CMP_GT_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_gt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I16 - - Inst_VOP3__V_CMP_GT_I16::~Inst_VOP3__V_CMP_GT_I16() - { - } // ~Inst_VOP3__V_CMP_GT_I16 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NE_I16::Inst_VOP3__V_CMP_NE_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ne_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I16 - - Inst_VOP3__V_CMP_NE_I16::~Inst_VOP3__V_CMP_NE_I16() - { - } // ~Inst_VOP3__V_CMP_NE_I16 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GE_I16::Inst_VOP3__V_CMP_GE_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ge_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I16 - - Inst_VOP3__V_CMP_GE_I16::~Inst_VOP3__V_CMP_GE_I16() - { - } // ~Inst_VOP3__V_CMP_GE_I16 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_T_I16::Inst_VOP3__V_CMP_T_I16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_t_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I16 - - Inst_VOP3__V_CMP_T_I16::~Inst_VOP3__V_CMP_T_I16() - { - } // ~Inst_VOP3__V_CMP_T_I16 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_F_U16::Inst_VOP3__V_CMP_F_U16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_f_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U16 - - Inst_VOP3__V_CMP_F_U16::~Inst_VOP3__V_CMP_F_U16() - { - } // ~Inst_VOP3__V_CMP_F_U16 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LT_U16::Inst_VOP3__V_CMP_LT_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_lt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U16 - - Inst_VOP3__V_CMP_LT_U16::~Inst_VOP3__V_CMP_LT_U16() - { - } // ~Inst_VOP3__V_CMP_LT_U16 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_EQ_U16::Inst_VOP3__V_CMP_EQ_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_eq_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U16 - - Inst_VOP3__V_CMP_EQ_U16::~Inst_VOP3__V_CMP_EQ_U16() - { - } // ~Inst_VOP3__V_CMP_EQ_U16 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LE_U16::Inst_VOP3__V_CMP_LE_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_le_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U16 - - Inst_VOP3__V_CMP_LE_U16::~Inst_VOP3__V_CMP_LE_U16() - { - } // ~Inst_VOP3__V_CMP_LE_U16 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GT_U16::Inst_VOP3__V_CMP_GT_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_gt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U16 - - Inst_VOP3__V_CMP_GT_U16::~Inst_VOP3__V_CMP_GT_U16() - { - } // ~Inst_VOP3__V_CMP_GT_U16 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NE_U16::Inst_VOP3__V_CMP_NE_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ne_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U16 - - Inst_VOP3__V_CMP_NE_U16::~Inst_VOP3__V_CMP_NE_U16() - { - } // ~Inst_VOP3__V_CMP_NE_U16 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GE_U16::Inst_VOP3__V_CMP_GE_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ge_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U16 - - Inst_VOP3__V_CMP_GE_U16::~Inst_VOP3__V_CMP_GE_U16() - { - } // ~Inst_VOP3__V_CMP_GE_U16 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_T_U16::Inst_VOP3__V_CMP_T_U16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_t_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U16 - - Inst_VOP3__V_CMP_T_U16::~Inst_VOP3__V_CMP_T_U16() - { - } // ~Inst_VOP3__V_CMP_T_U16 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMPX_F_I16::Inst_VOP3__V_CMPX_F_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_f_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_F_I16 - - Inst_VOP3__V_CMPX_F_I16::~Inst_VOP3__V_CMPX_F_I16() - { - } // ~Inst_VOP3__V_CMPX_F_I16 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LT_I16::Inst_VOP3__V_CMPX_LT_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_lt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_LT_I16 - - Inst_VOP3__V_CMPX_LT_I16::~Inst_VOP3__V_CMPX_LT_I16() - { - } // ~Inst_VOP3__V_CMPX_LT_I16 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_EQ_I16::Inst_VOP3__V_CMPX_EQ_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_eq_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_EQ_I16 - - Inst_VOP3__V_CMPX_EQ_I16::~Inst_VOP3__V_CMPX_EQ_I16() - { - } // ~Inst_VOP3__V_CMPX_EQ_I16 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LE_I16::Inst_VOP3__V_CMPX_LE_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_le_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_LE_I16 - - Inst_VOP3__V_CMPX_LE_I16::~Inst_VOP3__V_CMPX_LE_I16() - { - } // ~Inst_VOP3__V_CMPX_LE_I16 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GT_I16::Inst_VOP3__V_CMPX_GT_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_gt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GT_I16 - - Inst_VOP3__V_CMPX_GT_I16::~Inst_VOP3__V_CMPX_GT_I16() - { - } // ~Inst_VOP3__V_CMPX_GT_I16 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NE_I16::Inst_VOP3__V_CMPX_NE_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ne_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_NE_I16 - - Inst_VOP3__V_CMPX_NE_I16::~Inst_VOP3__V_CMPX_NE_I16() - { - } // ~Inst_VOP3__V_CMPX_NE_I16 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GE_I16::Inst_VOP3__V_CMPX_GE_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ge_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GE_I16 - - Inst_VOP3__V_CMPX_GE_I16::~Inst_VOP3__V_CMPX_GE_I16() - { - } // ~Inst_VOP3__V_CMPX_GE_I16 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_T_I16::Inst_VOP3__V_CMPX_T_I16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_t_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_T_I16 - - Inst_VOP3__V_CMPX_T_I16::~Inst_VOP3__V_CMPX_T_I16() - { - } // ~Inst_VOP3__V_CMPX_T_I16 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_F_U16::Inst_VOP3__V_CMPX_F_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_f_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_F_U16 - - Inst_VOP3__V_CMPX_F_U16::~Inst_VOP3__V_CMPX_F_U16() - { - } // ~Inst_VOP3__V_CMPX_F_U16 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LT_U16::Inst_VOP3__V_CMPX_LT_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_lt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_LT_U16 - - Inst_VOP3__V_CMPX_LT_U16::~Inst_VOP3__V_CMPX_LT_U16() - { - } // ~Inst_VOP3__V_CMPX_LT_U16 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_EQ_U16::Inst_VOP3__V_CMPX_EQ_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_eq_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_EQ_U16 - - Inst_VOP3__V_CMPX_EQ_U16::~Inst_VOP3__V_CMPX_EQ_U16() - { - } // ~Inst_VOP3__V_CMPX_EQ_U16 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LE_U16::Inst_VOP3__V_CMPX_LE_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_le_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_LE_U16 - - Inst_VOP3__V_CMPX_LE_U16::~Inst_VOP3__V_CMPX_LE_U16() - { - } // ~Inst_VOP3__V_CMPX_LE_U16 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GT_U16::Inst_VOP3__V_CMPX_GT_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_gt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GT_U16 - - Inst_VOP3__V_CMPX_GT_U16::~Inst_VOP3__V_CMPX_GT_U16() - { - } // ~Inst_VOP3__V_CMPX_GT_U16 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NE_U16::Inst_VOP3__V_CMPX_NE_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ne_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_NE_U16 - - Inst_VOP3__V_CMPX_NE_U16::~Inst_VOP3__V_CMPX_NE_U16() - { - } // ~Inst_VOP3__V_CMPX_NE_U16 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GE_U16::Inst_VOP3__V_CMPX_GE_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ge_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GE_U16 - - Inst_VOP3__V_CMPX_GE_U16::~Inst_VOP3__V_CMPX_GE_U16() - { - } // ~Inst_VOP3__V_CMPX_GE_U16 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_T_U16::Inst_VOP3__V_CMPX_T_U16( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_t_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_T_U16 - - Inst_VOP3__V_CMPX_T_U16::~Inst_VOP3__V_CMPX_T_U16() - { - } // ~Inst_VOP3__V_CMPX_T_U16 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
void
Inst_VOP3__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "True" compare: unconditionally set the bit for every active lane.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 1);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMP_F_I32::Inst_VOP3__V_CMP_F_I32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_f_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_F_I32

Inst_VOP3__V_CMP_F_I32::~Inst_VOP3__V_CMP_F_I32()
{
} // ~Inst_VOP3__V_CMP_F_I32

// D.u64[threadID] = 0; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "False" compare: unconditionally clear the bit for every active lane.
    // Non-CMPX compare: only SDST is written; EXEC is left untouched.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_LT_I32::Inst_VOP3__V_CMP_LT_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_lt_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_LT_I32

Inst_VOP3__V_CMP_LT_I32::~Inst_VOP3__V_CMP_LT_I32()
{
} // ~Inst_VOP3__V_CMP_LT_I32

// D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_EQ_I32::Inst_VOP3__V_CMP_EQ_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_eq_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_EQ_I32

Inst_VOP3__V_CMP_EQ_I32::~Inst_VOP3__V_CMP_EQ_I32()
{
} // ~Inst_VOP3__V_CMP_EQ_I32

// D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_LE_I32::Inst_VOP3__V_CMP_LE_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_le_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_LE_I32

Inst_VOP3__V_CMP_LE_I32::~Inst_VOP3__V_CMP_LE_I32()
{
} // ~Inst_VOP3__V_CMP_LE_I32

// D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
        }
    }

    // Non-CMPX compare: only SDST is written; EXEC is left untouched.
    sdst.write();
}

Inst_VOP3__V_CMP_GT_I32::Inst_VOP3__V_CMP_GT_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_gt_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_GT_I32

Inst_VOP3__V_CMP_GT_I32::~Inst_VOP3__V_CMP_GT_I32()
{
} // ~Inst_VOP3__V_CMP_GT_I32

// D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_NE_I32::Inst_VOP3__V_CMP_NE_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_ne_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_NE_I32

Inst_VOP3__V_CMP_NE_I32::~Inst_VOP3__V_CMP_NE_I32()
{
} // ~Inst_VOP3__V_CMP_NE_I32

// D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_GE_I32::Inst_VOP3__V_CMP_GE_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_ge_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_GE_I32

Inst_VOP3__V_CMP_GE_I32::~Inst_VOP3__V_CMP_GE_I32()
{
} // ~Inst_VOP3__V_CMP_GE_I32

// D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
        }
    }

    // Non-CMPX compare: only SDST is written; EXEC is left untouched.
    sdst.write();
}

Inst_VOP3__V_CMP_T_I32::Inst_VOP3__V_CMP_T_I32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_t_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_T_I32

Inst_VOP3__V_CMP_T_I32::~Inst_VOP3__V_CMP_T_I32()
{
} // ~Inst_VOP3__V_CMP_T_I32

// D.u64[threadID] = 1; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "True" compare: unconditionally set the bit for every active lane.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 1);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_F_U32::Inst_VOP3__V_CMP_F_U32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_f_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_F_U32

Inst_VOP3__V_CMP_F_U32::~Inst_VOP3__V_CMP_F_U32()
{
} // ~Inst_VOP3__V_CMP_F_U32

// D.u64[threadID] = 0; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "False" compare: unconditionally clear the bit for every active lane.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_LT_U32::Inst_VOP3__V_CMP_LT_U32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_lt_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_LT_U32

Inst_VOP3__V_CMP_LT_U32::~Inst_VOP3__V_CMP_LT_U32()
{
} // ~Inst_VOP3__V_CMP_LT_U32

// D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_EQ_U32::Inst_VOP3__V_CMP_EQ_U32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_eq_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_EQ_U32

Inst_VOP3__V_CMP_EQ_U32::~Inst_VOP3__V_CMP_EQ_U32()
{
} // ~Inst_VOP3__V_CMP_EQ_U32

// D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
        }
    }

    // Non-CMPX compare: only SDST is written; EXEC is left untouched.
    sdst.write();
}

Inst_VOP3__V_CMP_LE_U32::Inst_VOP3__V_CMP_LE_U32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_le_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_LE_U32

Inst_VOP3__V_CMP_LE_U32::~Inst_VOP3__V_CMP_LE_U32()
{
} // ~Inst_VOP3__V_CMP_LE_U32

// D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_GT_U32::Inst_VOP3__V_CMP_GT_U32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_gt_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_GT_U32

Inst_VOP3__V_CMP_GT_U32::~Inst_VOP3__V_CMP_GT_U32()
{
} // ~Inst_VOP3__V_CMP_GT_U32

// D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_NE_U32::Inst_VOP3__V_CMP_NE_U32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_ne_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_NE_U32

Inst_VOP3__V_CMP_NE_U32::~Inst_VOP3__V_CMP_NE_U32()
{
} // ~Inst_VOP3__V_CMP_NE_U32

// D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
        }
    }

    // Non-CMPX compare: only SDST is written; EXEC is left untouched.
    sdst.write();
}

Inst_VOP3__V_CMP_GE_U32::Inst_VOP3__V_CMP_GE_U32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_ge_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_GE_U32

Inst_VOP3__V_CMP_GE_U32::~Inst_VOP3__V_CMP_GE_U32()
{
} // ~Inst_VOP3__V_CMP_GE_U32

// D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMP_T_U32::Inst_VOP3__V_CMP_T_U32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmp_t_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMP_T_U32

Inst_VOP3__V_CMP_T_U32::~Inst_VOP3__V_CMP_T_U32()
{
} // ~Inst_VOP3__V_CMP_T_U32

// D.u64[threadID] = 1; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "True" compare: unconditionally set the bit for every active lane.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 1);
        }
    }

    sdst.write();
}

Inst_VOP3__V_CMPX_F_I32::Inst_VOP3__V_CMPX_F_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_f_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_F_I32

Inst_VOP3__V_CMPX_F_I32::~Inst_VOP3__V_CMPX_F_I32()
{
} // ~Inst_VOP3__V_CMPX_F_I32

// EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "False" compare: unconditionally clear the bit for every active lane.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 0);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMPX_LT_I32::Inst_VOP3__V_CMPX_LT_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_lt_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_LT_I32

Inst_VOP3__V_CMPX_LT_I32::~Inst_VOP3__V_CMPX_LT_I32()
{
} // ~Inst_VOP3__V_CMPX_LT_I32

// EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMPX_EQ_I32::Inst_VOP3__V_CMPX_EQ_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_eq_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_EQ_I32

Inst_VOP3__V_CMPX_EQ_I32::~Inst_VOP3__V_CMPX_EQ_I32()
{
} // ~Inst_VOP3__V_CMPX_EQ_I32

// EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMPX_LE_I32::Inst_VOP3__V_CMPX_LE_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_le_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_LE_I32

Inst_VOP3__V_CMPX_LE_I32::~Inst_VOP3__V_CMPX_LE_I32()
{
} // ~Inst_VOP3__V_CMPX_LE_I32

// EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMPX_GT_I32::Inst_VOP3__V_CMPX_GT_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_gt_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_GT_I32

Inst_VOP3__V_CMPX_GT_I32::~Inst_VOP3__V_CMPX_GT_I32()
{
} // ~Inst_VOP3__V_CMPX_GT_I32

// EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMPX_NE_I32::Inst_VOP3__V_CMPX_NE_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_ne_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_NE_I32

Inst_VOP3__V_CMPX_NE_I32::~Inst_VOP3__V_CMPX_NE_I32()
{
} // ~Inst_VOP3__V_CMPX_NE_I32

// EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMPX_GE_I32::Inst_VOP3__V_CMPX_GE_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_ge_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_GE_I32

Inst_VOP3__V_CMPX_GE_I32::~Inst_VOP3__V_CMPX_GE_I32()
{
} // ~Inst_VOP3__V_CMPX_GE_I32

// EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMPX_T_I32::Inst_VOP3__V_CMPX_T_I32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_t_i32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_T_I32

Inst_VOP3__V_CMPX_T_I32::~Inst_VOP3__V_CMPX_T_I32()
{
} // ~Inst_VOP3__V_CMPX_T_I32

// EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "True" compare: unconditionally set the bit for every active lane.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 1);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMPX_F_U32::Inst_VOP3__V_CMPX_F_U32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_f_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_F_U32

Inst_VOP3__V_CMPX_F_U32::~Inst_VOP3__V_CMPX_F_U32()
{
} // ~Inst_VOP3__V_CMPX_F_U32

// EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
void
Inst_VOP3__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ScalarOperandU64 sdst(gpuDynInst, instData.VDST);

    // "False" compare: unconditionally clear the bit for every active lane.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            sdst.setBit(lane, 0);
        }
    }

    // CMPX variant: the comparison result also replaces the EXEC mask.
    wf->execMask() = sdst.rawData();
    sdst.write();
}

Inst_VOP3__V_CMPX_LT_U32::Inst_VOP3__V_CMPX_LT_U32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cmpx_lt_u32", true)
{
    setFlag(ALU);
} // Inst_VOP3__V_CMPX_LT_U32

Inst_VOP3__V_CMPX_LT_U32::~Inst_VOP3__V_CMPX_LT_U32()
{
} // ~Inst_VOP3__V_CMPX_LT_U32

// EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
- void - Inst_VOP3__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_EQ_U32::Inst_VOP3__V_CMPX_EQ_U32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_eq_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_EQ_U32 - - Inst_VOP3__V_CMPX_EQ_U32::~Inst_VOP3__V_CMPX_EQ_U32() - { - } // ~Inst_VOP3__V_CMPX_EQ_U32 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LE_U32::Inst_VOP3__V_CMPX_LE_U32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_le_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_LE_U32 - - Inst_VOP3__V_CMPX_LE_U32::~Inst_VOP3__V_CMPX_LE_U32() - { - } // ~Inst_VOP3__V_CMPX_LE_U32 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GT_U32::Inst_VOP3__V_CMPX_GT_U32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_gt_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GT_U32 - - Inst_VOP3__V_CMPX_GT_U32::~Inst_VOP3__V_CMPX_GT_U32() - { - } // ~Inst_VOP3__V_CMPX_GT_U32 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NE_U32::Inst_VOP3__V_CMPX_NE_U32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ne_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_NE_U32 - - Inst_VOP3__V_CMPX_NE_U32::~Inst_VOP3__V_CMPX_NE_U32() - { - } // ~Inst_VOP3__V_CMPX_NE_U32 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GE_U32::Inst_VOP3__V_CMPX_GE_U32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ge_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GE_U32 - - Inst_VOP3__V_CMPX_GE_U32::~Inst_VOP3__V_CMPX_GE_U32() - { - } // ~Inst_VOP3__V_CMPX_GE_U32 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_T_U32::Inst_VOP3__V_CMPX_T_U32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_t_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_T_U32 - - Inst_VOP3__V_CMPX_T_U32::~Inst_VOP3__V_CMPX_T_U32() - { - } // ~Inst_VOP3__V_CMPX_T_U32 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMP_F_I64::Inst_VOP3__V_CMP_F_I64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_f_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I64 - - Inst_VOP3__V_CMP_F_I64::~Inst_VOP3__V_CMP_F_I64() - { - } // ~Inst_VOP3__V_CMP_F_I64 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LT_I64::Inst_VOP3__V_CMP_LT_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_lt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I64 - - Inst_VOP3__V_CMP_LT_I64::~Inst_VOP3__V_CMP_LT_I64() - { - } // ~Inst_VOP3__V_CMP_LT_I64 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_EQ_I64::Inst_VOP3__V_CMP_EQ_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_eq_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I64 - - Inst_VOP3__V_CMP_EQ_I64::~Inst_VOP3__V_CMP_EQ_I64() - { - } // ~Inst_VOP3__V_CMP_EQ_I64 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LE_I64::Inst_VOP3__V_CMP_LE_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_le_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I64 - - Inst_VOP3__V_CMP_LE_I64::~Inst_VOP3__V_CMP_LE_I64() - { - } // ~Inst_VOP3__V_CMP_LE_I64 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GT_I64::Inst_VOP3__V_CMP_GT_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_gt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I64 - - Inst_VOP3__V_CMP_GT_I64::~Inst_VOP3__V_CMP_GT_I64() - { - } // ~Inst_VOP3__V_CMP_GT_I64 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NE_I64::Inst_VOP3__V_CMP_NE_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ne_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I64 - - Inst_VOP3__V_CMP_NE_I64::~Inst_VOP3__V_CMP_NE_I64() - { - } // ~Inst_VOP3__V_CMP_NE_I64 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GE_I64::Inst_VOP3__V_CMP_GE_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ge_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I64 - - Inst_VOP3__V_CMP_GE_I64::~Inst_VOP3__V_CMP_GE_I64() - { - } // ~Inst_VOP3__V_CMP_GE_I64 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_T_I64::Inst_VOP3__V_CMP_T_I64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_t_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I64 - - Inst_VOP3__V_CMP_T_I64::~Inst_VOP3__V_CMP_T_I64() - { - } // ~Inst_VOP3__V_CMP_T_I64 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_F_U64::Inst_VOP3__V_CMP_F_U64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_f_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U64 - - Inst_VOP3__V_CMP_F_U64::~Inst_VOP3__V_CMP_F_U64() - { - } // ~Inst_VOP3__V_CMP_F_U64 - - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LT_U64::Inst_VOP3__V_CMP_LT_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_lt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U64 - - Inst_VOP3__V_CMP_LT_U64::~Inst_VOP3__V_CMP_LT_U64() - { - } // ~Inst_VOP3__V_CMP_LT_U64 - - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_EQ_U64::Inst_VOP3__V_CMP_EQ_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_eq_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U64 - - Inst_VOP3__V_CMP_EQ_U64::~Inst_VOP3__V_CMP_EQ_U64() - { - } // ~Inst_VOP3__V_CMP_EQ_U64 - - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_LE_U64::Inst_VOP3__V_CMP_LE_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_le_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U64 - - Inst_VOP3__V_CMP_LE_U64::~Inst_VOP3__V_CMP_LE_U64() - { - } // ~Inst_VOP3__V_CMP_LE_U64 - - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GT_U64::Inst_VOP3__V_CMP_GT_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_gt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U64 - - Inst_VOP3__V_CMP_GT_U64::~Inst_VOP3__V_CMP_GT_U64() - { - } // ~Inst_VOP3__V_CMP_GT_U64 - - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_NE_U64::Inst_VOP3__V_CMP_NE_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ne_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U64 - - Inst_VOP3__V_CMP_NE_U64::~Inst_VOP3__V_CMP_NE_U64() - { - } // ~Inst_VOP3__V_CMP_NE_U64 - - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_GE_U64::Inst_VOP3__V_CMP_GE_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_ge_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U64 - - Inst_VOP3__V_CMP_GE_U64::~Inst_VOP3__V_CMP_GE_U64() - { - } // ~Inst_VOP3__V_CMP_GE_U64 - - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMP_T_U64::Inst_VOP3__V_CMP_T_U64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmp_t_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U64 - - Inst_VOP3__V_CMP_T_U64::~Inst_VOP3__V_CMP_T_U64() - { - } // ~Inst_VOP3__V_CMP_T_U64 - - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } - - Inst_VOP3__V_CMPX_F_I64::Inst_VOP3__V_CMPX_F_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_f_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_F_I64 - - Inst_VOP3__V_CMPX_F_I64::~Inst_VOP3__V_CMPX_F_I64() - { - } // ~Inst_VOP3__V_CMPX_F_I64 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LT_I64::Inst_VOP3__V_CMPX_LT_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_lt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_LT_I64 - - Inst_VOP3__V_CMPX_LT_I64::~Inst_VOP3__V_CMPX_LT_I64() - { - } // ~Inst_VOP3__V_CMPX_LT_I64 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_EQ_I64::Inst_VOP3__V_CMPX_EQ_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_eq_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_EQ_I64 - - Inst_VOP3__V_CMPX_EQ_I64::~Inst_VOP3__V_CMPX_EQ_I64() - { - } // ~Inst_VOP3__V_CMPX_EQ_I64 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LE_I64::Inst_VOP3__V_CMPX_LE_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_le_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_LE_I64 - - Inst_VOP3__V_CMPX_LE_I64::~Inst_VOP3__V_CMPX_LE_I64() - { - } // ~Inst_VOP3__V_CMPX_LE_I64 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GT_I64::Inst_VOP3__V_CMPX_GT_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_gt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GT_I64 - - Inst_VOP3__V_CMPX_GT_I64::~Inst_VOP3__V_CMPX_GT_I64() - { - } // ~Inst_VOP3__V_CMPX_GT_I64 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NE_I64::Inst_VOP3__V_CMPX_NE_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ne_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_NE_I64 - - Inst_VOP3__V_CMPX_NE_I64::~Inst_VOP3__V_CMPX_NE_I64() - { - } // ~Inst_VOP3__V_CMPX_NE_I64 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GE_I64::Inst_VOP3__V_CMPX_GE_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ge_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GE_I64 - - Inst_VOP3__V_CMPX_GE_I64::~Inst_VOP3__V_CMPX_GE_I64() - { - } // ~Inst_VOP3__V_CMPX_GE_I64 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_T_I64::Inst_VOP3__V_CMPX_T_I64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_t_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_T_I64 - - Inst_VOP3__V_CMPX_T_I64::~Inst_VOP3__V_CMPX_T_I64() - { - } // ~Inst_VOP3__V_CMPX_T_I64 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_F_U64::Inst_VOP3__V_CMPX_F_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_f_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_F_U64 - - Inst_VOP3__V_CMPX_F_U64::~Inst_VOP3__V_CMPX_F_U64() - { - } // ~Inst_VOP3__V_CMPX_F_U64 - - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LT_U64::Inst_VOP3__V_CMPX_LT_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_lt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_LT_U64 - - Inst_VOP3__V_CMPX_LT_U64::~Inst_VOP3__V_CMPX_LT_U64() - { - } // ~Inst_VOP3__V_CMPX_LT_U64 - - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_EQ_U64::Inst_VOP3__V_CMPX_EQ_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_eq_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_EQ_U64 - - Inst_VOP3__V_CMPX_EQ_U64::~Inst_VOP3__V_CMPX_EQ_U64() - { - } // ~Inst_VOP3__V_CMPX_EQ_U64 - - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_LE_U64::Inst_VOP3__V_CMPX_LE_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_le_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_LE_U64 - - Inst_VOP3__V_CMPX_LE_U64::~Inst_VOP3__V_CMPX_LE_U64() - { - } // ~Inst_VOP3__V_CMPX_LE_U64 - - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GT_U64::Inst_VOP3__V_CMPX_GT_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_gt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GT_U64 - - Inst_VOP3__V_CMPX_GT_U64::~Inst_VOP3__V_CMPX_GT_U64() - { - } // ~Inst_VOP3__V_CMPX_GT_U64 - - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_NE_U64::Inst_VOP3__V_CMPX_NE_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ne_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_NE_U64 - - Inst_VOP3__V_CMPX_NE_U64::~Inst_VOP3__V_CMPX_NE_U64() - { - } // ~Inst_VOP3__V_CMPX_NE_U64 - - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_GE_U64::Inst_VOP3__V_CMPX_GE_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_ge_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_GE_U64 - - Inst_VOP3__V_CMPX_GE_U64::~Inst_VOP3__V_CMPX_GE_U64() - { - } // ~Inst_VOP3__V_CMPX_GE_U64 - - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CMPX_T_U64::Inst_VOP3__V_CMPX_T_U64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cmpx_t_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMPX_T_U64 - - Inst_VOP3__V_CMPX_T_U64::~Inst_VOP3__V_CMPX_T_U64() - { - } // ~Inst_VOP3__V_CMPX_T_U64 - - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } - - Inst_VOP3__V_CNDMASK_B32::Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cndmask_b32", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - } // Inst_VOP3__V_CNDMASK_B32 - - Inst_VOP3__V_CNDMASK_B32::~Inst_VOP3__V_CNDMASK_B32() - { - } // ~Inst_VOP3__V_CNDMASK_B32 - - // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC - // as a scalar GPR in S2. - void - Inst_VOP3__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(vcc.rawData(), lane) - ? 
src1[lane] : src0[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_ADD_F32::Inst_VOP3__V_ADD_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_add_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_ADD_F32 - - Inst_VOP3__V_ADD_F32::~Inst_VOP3__V_ADD_F32() - { - } // ~Inst_VOP3__V_ADD_F32 - - // D.f = S0.f + S1.f. - void - Inst_VOP3__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_SUB_F32::Inst_VOP3__V_SUB_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_sub_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SUB_F32 - - Inst_VOP3__V_SUB_F32::~Inst_VOP3__V_SUB_F32() - { - } // ~Inst_VOP3__V_SUB_F32 - - // D.f = S0.f - S1.f. 
- void - Inst_VOP3__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_SUBREV_F32::Inst_VOP3__V_SUBREV_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_subrev_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SUBREV_F32 - - Inst_VOP3__V_SUBREV_F32::~Inst_VOP3__V_SUBREV_F32() - { - } // ~Inst_VOP3__V_SUBREV_F32 - - // D.f = S1.f - S0.f. 
    void
    Inst_VOP3__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Apply per-source FP input modifiers (abs/neg) before computing.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        // Reverse subtract: operand order is S1 - S0.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MUL_LEGACY_F32::Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_legacy_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MUL_LEGACY_F32

    Inst_VOP3__V_MUL_LEGACY_F32::~Inst_VOP3__V_MUL_LEGACY_F32()
    {
    } // ~Inst_VOP3__V_MUL_LEGACY_F32

    // D.f = S0.f * S1.f
    void
    Inst_VOP3__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // Apply per-source FP input modifiers (abs/neg) before computing.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        // "Legacy" multiply: the special cases below implement DX9-style
        // semantics where 0 * anything (including infinity) yields a signed
        // zero, rather than IEEE-754 NaN propagation. The branch order
        // matters: NaN inputs first, then zero/denormal S0, then infinite
        // S0, finally the ordinary product.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isnan(src0[lane]) ||
                    std::isnan(src1[lane])) {
                    // NaN in either operand propagates.
                    vdst[lane] = NAN;
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           !std::signbit(src0[lane])) {
                    // +0/+denorm times infinity is NaN; otherwise a zero
                    // whose sign follows S1.
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           std::signbit(src0[lane])) {
                    // -0/-denorm: same as above with the sign flipped.
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if (std::isinf(src0[lane]) &&
                           !std::signbit(src0[lane])) {
                    // +inf times zero/denorm is NaN; otherwise an infinity
                    // whose sign follows S1.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else if (std::isinf(src0[lane]) &&
                           std::signbit(src0[lane])) {
                    // -inf: same as above with the sign flipped.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else {
                    // No special case: ordinary IEEE multiply.
                    vdst[lane] = src0[lane] * src1[lane];
                }
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MUL_F32::Inst_VOP3__V_MUL_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mul_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MUL_F32

    Inst_VOP3__V_MUL_F32::~Inst_VOP3__V_MUL_F32()
    {
    } // ~Inst_VOP3__V_MUL_F32

    // D.f = S0.f * S1.f.
- void - Inst_VOP3__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } 
else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP3__V_MUL_I32_I24::Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_mul_i32_i24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_I32_I24 - - Inst_VOP3__V_MUL_I32_I24::~Inst_VOP3__V_MUL_I32_I24() - { - } // ~Inst_VOP3__V_MUL_I32_I24 - - // D.i = S0.i[23:0] * S1.i[23:0]. - void - Inst_VOP3__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = szext<24>(src0[lane]) * szext<24>(src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MUL_HI_I32_I24::Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_mul_hi_i32_i24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_I32_I24 - - Inst_VOP3__V_MUL_HI_I32_I24::~Inst_VOP3__V_MUL_HI_I32_I24() - { - } // ~Inst_VOP3__V_MUL_HI_I32_I24 - - // D.i = (S0.i[23:0] * S1.i[23:0]) >> 32. 
- void - Inst_VOP3__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 tmp_src0 = (VecElemI64)szext<24>(src0[lane]); - VecElemI64 tmp_src1 = (VecElemI64)szext<24>(src1[lane]); - - vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MUL_U32_U24::Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_mul_u32_u24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_U32_U24 - - Inst_VOP3__V_MUL_U32_U24::~Inst_VOP3__V_MUL_U32_U24() - { - } // ~Inst_VOP3__V_MUL_U32_U24 - - // D.u = S0.u[23:0] * S1.u[23:0]. 
- void - Inst_VOP3__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MUL_HI_U32_U24::Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_mul_hi_u32_u24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_U32_U24 - - Inst_VOP3__V_MUL_HI_U32_U24::~Inst_VOP3__V_MUL_HI_U32_U24() - { - } // ~Inst_VOP3__V_MUL_HI_U32_U24 - - // D.i = (S0.u[23:0] * S1.u[23:0]) >> 32. 
- void - Inst_VOP3__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); - VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); - vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MIN_F32::Inst_VOP3__V_MIN_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_min_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MIN_F32 - - Inst_VOP3__V_MIN_F32::~Inst_VOP3__V_MIN_F32() - { - } // ~Inst_VOP3__V_MIN_F32 - - // D.f = (S0.f < S1.f ? S0.f : S1.f). 
- void - Inst_VOP3__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MAX_F32::Inst_VOP3__V_MAX_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_max_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MAX_F32 - - Inst_VOP3__V_MAX_F32::~Inst_VOP3__V_MAX_F32() - { - } // ~Inst_VOP3__V_MAX_F32 - - // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
- void - Inst_VOP3__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MIN_I32::Inst_VOP3__V_MIN_I32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_min_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_I32 - - Inst_VOP3__V_MIN_I32::~Inst_VOP3__V_MIN_I32() - { - } // ~Inst_VOP3__V_MIN_I32 - - // D.i = min(S0.i, S1.i). 
- void - Inst_VOP3__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MAX_I32::Inst_VOP3__V_MAX_I32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_max_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_I32 - - Inst_VOP3__V_MAX_I32::~Inst_VOP3__V_MAX_I32() - { - } // ~Inst_VOP3__V_MAX_I32 - - // D.i = max(S0.i, S1.i). 
- void - Inst_VOP3__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MIN_U32::Inst_VOP3__V_MIN_U32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_min_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_U32 - - Inst_VOP3__V_MIN_U32::~Inst_VOP3__V_MIN_U32() - { - } // ~Inst_VOP3__V_MIN_U32 - - // D.u = min(S0.u, S1.u). 
- void - Inst_VOP3__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MAX_U32::Inst_VOP3__V_MAX_U32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_max_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_U32 - - Inst_VOP3__V_MAX_U32::~Inst_VOP3__V_MAX_U32() - { - } // ~Inst_VOP3__V_MAX_U32 - - // D.u = max(S0.u, S1.u). 
- void - Inst_VOP3__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_LSHRREV_B32::Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_lshrrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHRREV_B32 - - Inst_VOP3__V_LSHRREV_B32::~Inst_VOP3__V_LSHRREV_B32() - { - } // ~Inst_VOP3__V_LSHRREV_B32 - - // D.u = S1.u >> S0.u[4:0]. - // The vacated bits are set to zero. 
- void - Inst_VOP3__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } - - Inst_VOP3__V_ASHRREV_I32::Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_ashrrev_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ASHRREV_I32 - - Inst_VOP3__V_ASHRREV_I32::~Inst_VOP3__V_ASHRREV_I32() - { - } // ~Inst_VOP3__V_ASHRREV_I32 - - // D.i = signext(S1.i) >> S0.i[4:0]. - // The vacated bits are set to the sign bit of the input value. 
- void - Inst_VOP3__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } - - Inst_VOP3__V_LSHLREV_B32::Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_lshlrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B32 - - Inst_VOP3__V_LSHLREV_B32::~Inst_VOP3__V_LSHLREV_B32() - { - } // ~Inst_VOP3__V_LSHLREV_B32 - - // D.u = S1.u << S0.u[4:0]. 
- void - Inst_VOP3__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } - - Inst_VOP3__V_AND_B32::Inst_VOP3__V_AND_B32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_and_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_AND_B32 - - Inst_VOP3__V_AND_B32::~Inst_VOP3__V_AND_B32() - { - } // ~Inst_VOP3__V_AND_B32 - - // D.u = S0.u & S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] & src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_OR_B32::Inst_VOP3__V_OR_B32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_OR_B32 - - Inst_VOP3__V_OR_B32::~Inst_VOP3__V_OR_B32() - { - } // ~Inst_VOP3__V_OR_B32 - - // D.u = S0.u | S1.u. - // Input and output modifiers not supported. 
// D.u = S0.u | S1.u.
// Input and output modifiers not supported.
void
Inst_VOP3__V_OR_B32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    // Bitwise OR, applied only to lanes enabled in the EXEC mask.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src0[lane] | src1[lane];
        }
    }

    vdst.write();
}

Inst_VOP3__V_XOR_B32::Inst_VOP3__V_XOR_B32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_xor_b32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_XOR_B32

Inst_VOP3__V_XOR_B32::~Inst_VOP3__V_XOR_B32()
{
} // ~Inst_VOP3__V_XOR_B32

// D.u = S0.u ^ S1.u.
// Input and output modifiers not supported.
void
Inst_VOP3__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    // Bitwise XOR, applied only to lanes enabled in the EXEC mask.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src0[lane] ^ src1[lane];
        }
    }

    vdst.write();
}

Inst_VOP3__V_MAC_F32::Inst_VOP3__V_MAC_F32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mac_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
    setFlag(MAC);
} // Inst_VOP3__V_MAC_F32

Inst_VOP3__V_MAC_F32::~Inst_VOP3__V_MAC_F32()
{
} // ~Inst_VOP3__V_MAC_F32

// D.f = S0.f * S1.f + D.f.
void
Inst_VOP3__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    // MAC accumulates into the destination, so the old value is read first.
    vdst.read();

    // FP operation: ABS/NEG input modifiers on src0/src1 are honored.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x4));

    // std::fma gives a fused multiply-add (single rounding).
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
        }
    }

    vdst.write();
}

Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3_SDST_ENC *iFmt)
    : Inst_VOP3_SDST_ENC(iFmt, "v_add_u32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
} // Inst_VOP3__V_ADD_U32

Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32()
{
} // ~Inst_VOP3__V_ADD_U32

// D.u = S0.u + S1.u;
// VCC[threadId] = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an UNSIGNED
// overflow or carry-out.
// In VOP3 the VCC destination may be an arbitrary SGPR-pair.
void
Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);
    // VOP3_SDST_ENC: the carry-out goes to an arbitrary SGPR pair (SDST),
    // not necessarily the architectural VCC.
    ScalarOperandU64 vcc(gpuDynInst, instData.SDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src0[lane] + src1[lane];
            // Detect 32-bit carry-out by doing the add in 64 bits.
            vcc.setBit(lane, ((VecElemU64)src0[lane]
                + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0);
        }
    }

    vdst.write();
    vcc.write();
}

Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3_SDST_ENC *iFmt)
    : Inst_VOP3_SDST_ENC(iFmt, "v_sub_u32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
} // Inst_VOP3__V_SUB_U32

Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32()
{
} // ~Inst_VOP3__V_SUB_U32

// D.u = S0.u - S1.u;
// VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
// carry-out.
// In VOP3 the VCC destination may be an arbitrary SGPR-pair.
void
Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);
    ScalarOperandU64 vcc(gpuDynInst, instData.SDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src0[lane] - src1[lane];
            // Borrow-out: set when the subtrahend exceeds the minuend.
            vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
        }
    }

    vdst.write();
    vcc.write();
}

Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(
    InFmt_VOP3_SDST_ENC *iFmt)
    : Inst_VOP3_SDST_ENC(iFmt, "v_subrev_u32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
} // Inst_VOP3__V_SUBREV_U32

Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32()
{
} // ~Inst_VOP3__V_SUBREV_U32

// D.u = S1.u - S0.u;
// VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
// carry-out.
// In VOP3 the VCC destination may be an arbitrary SGPR-pair.
void
Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);
    ScalarOperandU64 vcc(gpuDynInst, instData.SDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    // Reversed operand order relative to v_sub_u32: S1 - S0.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src1[lane] - src0[lane];
            vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
        }
    }

    vdst.write();
    vcc.write();
}

Inst_VOP3__V_ADDC_U32::Inst_VOP3__V_ADDC_U32(InFmt_VOP3_SDST_ENC *iFmt)
    : Inst_VOP3_SDST_ENC(iFmt, "v_addc_u32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
    setFlag(ReadsVCC);
} // Inst_VOP3__V_ADDC_U32

Inst_VOP3__V_ADDC_U32::~Inst_VOP3__V_ADDC_U32()
{
} // ~Inst_VOP3__V_ADDC_U32

// D.u = S0.u + S1.u + VCC[threadId];
// VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x100000000ULL ? 1 : 0)
// is an UNSIGNED overflow.
// In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
// source comes from the SGPR-pair at S2.u.
void
Inst_VOP3__V_ADDC_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    // Carry-in: SGPR pair addressed by SRC2 (read-only here).
    ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);
    // Carry-out: SGPR pair addressed by SDST (written below).
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

    src0.readSrc();
    src1.readSrc();
    vcc.read();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // bits(vcc.rawData(), lane) extracts this lane's carry-in bit.
            vdst[lane] = src0[lane] + src1[lane]
                + bits(vcc.rawData(), lane);
            // Carry-out computed with the full 64-bit sum.
            sdst.setBit(lane, ((VecElemU64)src0[lane]
                + (VecElemU64)src1[lane]
                + (VecElemU64)bits(vcc.rawData(), lane))
                >= 0x100000000 ? 1 : 0);
        }
    }

    vdst.write();
    sdst.write();
}

Inst_VOP3__V_SUBB_U32::Inst_VOP3__V_SUBB_U32(InFmt_VOP3_SDST_ENC *iFmt)
    : Inst_VOP3_SDST_ENC(iFmt, "v_subb_u32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
    setFlag(ReadsVCC);
} // Inst_VOP3__V_SUBB_U32

Inst_VOP3__V_SUBB_U32::~Inst_VOP3__V_SUBB_U32()
{
} // ~Inst_VOP3__V_SUBB_U32

// D.u = S0.u - S1.u - VCC[threadId];
// VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
// overflow.
// In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
// source comes from the SGPR-pair at S2.u.
void
Inst_VOP3__V_SUBB_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    // Borrow-in comes from SRC2; borrow-out is written to SDST.
    ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
    ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    vcc.read();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src0[lane] - src1[lane]
                - bits(vcc.rawData(), lane);
            sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
                > src0[lane] ? 1 : 0);
        }
    }

    vdst.write();
    sdst.write();
}

Inst_VOP3__V_SUBBREV_U32::Inst_VOP3__V_SUBBREV_U32(
    InFmt_VOP3_SDST_ENC *iFmt)
    : Inst_VOP3_SDST_ENC(iFmt, "v_subbrev_u32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
    setFlag(ReadsVCC);
} // Inst_VOP3__V_SUBBREV_U32

Inst_VOP3__V_SUBBREV_U32::~Inst_VOP3__V_SUBBREV_U32()
{
} // ~Inst_VOP3__V_SUBBREV_U32

// D.u = S1.u - S0.u - VCC[threadId];
// VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
// overflow.
// In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
// source comes from the SGPR-pair at S2.u.
- void - Inst_VOP3__V_SUBBREV_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane] - - bits(vcc.rawData(), lane); - sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - sdst.write(); - } - - Inst_VOP3__V_ADD_F16::Inst_VOP3__V_ADD_F16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_add_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_ADD_F16 - - Inst_VOP3__V_ADD_F16::~Inst_VOP3__V_ADD_F16() - { - } // ~Inst_VOP3__V_ADD_F16 - - // D.f16 = S0.f16 + S1.f16. - void - Inst_VOP3__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_SUB_F16::Inst_VOP3__V_SUB_F16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_sub_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SUB_F16 - - Inst_VOP3__V_SUB_F16::~Inst_VOP3__V_SUB_F16() - { - } // ~Inst_VOP3__V_SUB_F16 - - // D.f16 = S0.f16 - S1.f16. - void - Inst_VOP3__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_SUBREV_F16::Inst_VOP3__V_SUBREV_F16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_subrev_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SUBREV_F16 - - Inst_VOP3__V_SUBREV_F16::~Inst_VOP3__V_SUBREV_F16() - { - } // ~Inst_VOP3__V_SUBREV_F16 - - // D.f16 = S1.f16 - S0.f16. 
void
Inst_VOP3__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // F16 arithmetic is not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_MUL_F16::Inst_VOP3__V_MUL_F16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mul_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP3__V_MUL_F16

Inst_VOP3__V_MUL_F16::~Inst_VOP3__V_MUL_F16()
{
} // ~Inst_VOP3__V_MUL_F16

// D.f16 = S0.f16 * S1.f16.
void
Inst_VOP3__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // F16 arithmetic is not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_MAC_F16::Inst_VOP3__V_MAC_F16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mac_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
    setFlag(MAC);
} // Inst_VOP3__V_MAC_F16

Inst_VOP3__V_MAC_F16::~Inst_VOP3__V_MAC_F16()
{
} // ~Inst_VOP3__V_MAC_F16

// D.f16 = S0.f16 * S1.f16 + D.f16.
void
Inst_VOP3__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // F16 arithmetic is not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_ADD_U16::Inst_VOP3__V_ADD_U16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_add_u16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_ADD_U16

Inst_VOP3__V_ADD_U16::~Inst_VOP3__V_ADD_U16()
{
} // ~Inst_VOP3__V_ADD_U16

// D.u16 = S0.u16 + S1.u16.
void
Inst_VOP3__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
    VecOperandU16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src0[lane] + src1[lane];
        }
    }

    vdst.write();
}

Inst_VOP3__V_SUB_U16::Inst_VOP3__V_SUB_U16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_sub_u16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_SUB_U16

Inst_VOP3__V_SUB_U16::~Inst_VOP3__V_SUB_U16()
{
} // ~Inst_VOP3__V_SUB_U16

// D.u16 = S0.u16 - S1.u16.
void
Inst_VOP3__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
    VecOperandU16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src0[lane] - src1[lane];
        }
    }

    vdst.write();
}

Inst_VOP3__V_SUBREV_U16::Inst_VOP3__V_SUBREV_U16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_subrev_u16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_SUBREV_U16

Inst_VOP3__V_SUBREV_U16::~Inst_VOP3__V_SUBREV_U16()
{
} // ~Inst_VOP3__V_SUBREV_U16

// D.u16 = S1.u16 - S0.u16.
void
Inst_VOP3__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
    VecOperandU16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    // Reversed operand order: S1 - S0.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src1[lane] - src0[lane];
        }
    }

    vdst.write();
}

Inst_VOP3__V_MUL_LO_U16::Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mul_lo_u16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MUL_LO_U16

Inst_VOP3__V_MUL_LO_U16::~Inst_VOP3__V_MUL_LO_U16()
{
} // ~Inst_VOP3__V_MUL_LO_U16

// D.u16 = S0.u16 * S1.u16.
void
Inst_VOP3__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
    VecOperandU16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    // 16-bit multiply; the truncation to the low 16 bits happens on
    // assignment to the U16 destination element.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src0[lane] * src1[lane];
        }
    }

    vdst.write();
}

Inst_VOP3__V_LSHLREV_B16::Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_lshlrev_b16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_LSHLREV_B16

Inst_VOP3__V_LSHLREV_B16::~Inst_VOP3__V_LSHLREV_B16()
{
} // ~Inst_VOP3__V_LSHLREV_B16

// D.u[15:0] = S1.u[15:0] << S0.u[3:0].
void
Inst_VOP3__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
    VecOperandU16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    // "rev" encoding: shift amount comes from S0 (low 4 bits), value
    // from S1.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
        }
    }

    vdst.write();
}

Inst_VOP3__V_LSHRREV_B16::Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_lshrrev_b16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_LSHRREV_B16

Inst_VOP3__V_LSHRREV_B16::~Inst_VOP3__V_LSHRREV_B16()
{
} // ~Inst_VOP3__V_LSHRREV_B16

// D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
// The vacated bits are set to zero.
void
Inst_VOP3__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
    VecOperandU16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // NOTE(review): unlike v_lshlrev_b16 (which asserts that ABS/NEG are
    // clear), this untyped shift honors the FP input modifiers on its
    // integer operands. Confirm whether this asymmetry is intentional.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    // Logical shift right: S1 shifted by the low 4 bits of S0.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
        }
    }

    vdst.write();
}

Inst_VOP3__V_ASHRREV_I16::Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_ashrrev_i16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_ASHRREV_I16

Inst_VOP3__V_ASHRREV_I16::~Inst_VOP3__V_ASHRREV_I16()
{
} // ~Inst_VOP3__V_ASHRREV_I16

// D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
// The vacated bits are set to the sign bit of the input value.
void
Inst_VOP3__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
    VecOperandI16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    // Arithmetic shift: src1 is a signed I16, so >> sign-extends.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
        }
    }

    vdst.write();
}

Inst_VOP3__V_MAX_F16::Inst_VOP3__V_MAX_F16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_max_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP3__V_MAX_F16

Inst_VOP3__V_MAX_F16::~Inst_VOP3__V_MAX_F16()
{
} // ~Inst_VOP3__V_MAX_F16

// D.f16 = max(S0.f16, S1.f16).
void
Inst_VOP3__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // F16 arithmetic is not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_MIN_F16::Inst_VOP3__V_MIN_F16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_min_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP3__V_MIN_F16

Inst_VOP3__V_MIN_F16::~Inst_VOP3__V_MIN_F16()
{
} // ~Inst_VOP3__V_MIN_F16

// D.f16 = min(S0.f16, S1.f16).
void
Inst_VOP3__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // F16 arithmetic is not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_MAX_U16::Inst_VOP3__V_MAX_U16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_max_u16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MAX_U16

Inst_VOP3__V_MAX_U16::~Inst_VOP3__V_MAX_U16()
{
} // ~Inst_VOP3__V_MAX_U16

// D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
void
Inst_VOP3__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
    VecOperandU16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // NOTE(review): ABS/NEG modifiers are honored here on integer
    // operands, while most integer ops in this file assert them clear.
    // Confirm this is intended.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::max(src0[lane], src1[lane]);
        }
    }

    vdst.write();
}

Inst_VOP3__V_MAX_I16::Inst_VOP3__V_MAX_I16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_max_i16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MAX_I16

Inst_VOP3__V_MAX_I16::~Inst_VOP3__V_MAX_I16()
{
} // ~Inst_VOP3__V_MAX_I16

// D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
void
Inst_VOP3__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
    VecOperandI16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::max(src0[lane], src1[lane]);
        }
    }

    vdst.write();
}

Inst_VOP3__V_MIN_U16::Inst_VOP3__V_MIN_U16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_min_u16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MIN_U16

Inst_VOP3__V_MIN_U16::~Inst_VOP3__V_MIN_U16()
{
} // ~Inst_VOP3__V_MIN_U16

// D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
void
Inst_VOP3__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
    VecOperandU16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::min(src0[lane], src1[lane]);
        }
    }

    vdst.write();
}

Inst_VOP3__V_MIN_I16::Inst_VOP3__V_MIN_I16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_min_i16", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MIN_I16

Inst_VOP3__V_MIN_I16::~Inst_VOP3__V_MIN_I16()
{
} // ~Inst_VOP3__V_MIN_I16

// D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
void
Inst_VOP3__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
    VecOperandI16 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::min(src0[lane], src1[lane]);
        }
    }

    vdst.write();
}

Inst_VOP3__V_LDEXP_F16::Inst_VOP3__V_LDEXP_F16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_ldexp_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP3__V_LDEXP_F16

Inst_VOP3__V_LDEXP_F16::~Inst_VOP3__V_LDEXP_F16()
{
} // ~Inst_VOP3__V_LDEXP_F16

// D.f16 = S0.f16 * (2 ** S1.i16).
void
Inst_VOP3__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // F16 arithmetic is not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_nop", false)
{
    setFlag(Nop);
    setFlag(ALU);
} // Inst_VOP3__V_NOP

Inst_VOP3__V_NOP::~Inst_VOP3__V_NOP()
{
} // ~Inst_VOP3__V_NOP

// Do nothing.
void
Inst_VOP3__V_NOP::execute(GPUDynInstPtr gpuDynInst)
{
}

Inst_VOP3__V_MOV_B32::Inst_VOP3__V_MOV_B32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mov_b32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MOV_B32

Inst_VOP3__V_MOV_B32::~Inst_VOP3__V_MOV_B32()
{
} // ~Inst_VOP3__V_MOV_B32

// D.u = S0.u.
// Input and output modifiers not supported; this is an untyped operation.
void
Inst_VOP3__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    // Straight per-lane copy for active lanes.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src[lane];
        }
    }

    vdst.write();
}

Inst_VOP3__V_CVT_I32_F64::Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_i32_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_CVT_I32_F64

Inst_VOP3__V_CVT_I32_F64::~Inst_VOP3__V_CVT_I32_F64()
{
} // ~Inst_VOP3__V_CVT_I32_F64

// D.i = (int)S0.d.
// Out-of-range floating point values (including infinity) saturate. NaN
// is converted to 0.
void
Inst_VOP3__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
    VecOperandI32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    if (instData.ABS & 0x1) {
        src.absModifier();
    }

    if (extData.NEG & 0x1) {
        src.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            int exp;
            // frexp yields the binary exponent; exp > 30 means the
            // magnitude is >= 2^30 * 2 and cannot fit in a signed int,
            // so the result saturates to INT_MIN/INT_MAX by sign.
            std::frexp(src[lane],&exp);
            if (std::isnan(src[lane])) {
                vdst[lane] = 0;
            } else if (std::isinf(src[lane]) || exp > 30) {
                if (std::signbit(src[lane])) {
                    vdst[lane] = INT_MIN;
                } else {
                    vdst[lane] = INT_MAX;
                }
            } else {
                vdst[lane] = (VecElemI32)src[lane];
            }
        }
    }

    vdst.write();
}

Inst_VOP3__V_CVT_F64_I32::Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_f64_i32", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_CVT_F64_I32

Inst_VOP3__V_CVT_F64_I32::~Inst_VOP3__V_CVT_F64_I32()
{
} // ~Inst_VOP3__V_CVT_F64_I32

// D.d = (double)S0.i.
void
Inst_VOP3__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    // NOTE(review): ABS/NEG modifiers applied to an integer source;
    // siblings assert these clear — confirm intended.
    if (instData.ABS & 0x1) {
        src.absModifier();
    }

    if (extData.NEG & 0x1) {
        src.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = (VecElemF64)src[lane];
        }
    }

    vdst.write();
}

Inst_VOP3__V_CVT_F32_I32::Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_f32_i32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_F32_I32

Inst_VOP3__V_CVT_F32_I32::~Inst_VOP3__V_CVT_F32_I32()
{
} // ~Inst_VOP3__V_CVT_F32_I32

// D.f = (float)S0.i.
- void - Inst_VOP3__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - VecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_CVT_F32_U32::Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cvt_f32_u32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_U32 - - Inst_VOP3__V_CVT_F32_U32::~Inst_VOP3__V_CVT_F32_U32() - { - } // ~Inst_VOP3__V_CVT_F32_U32 - - // D.f = (float)S0.u. - void - Inst_VOP3__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_CVT_U32_F32::Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cvt_u32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_U32_F32 - - Inst_VOP3__V_CVT_U32_F32::~Inst_VOP3__V_CVT_U32_F32() - { - } // ~Inst_VOP3__V_CVT_U32_F32 - - // D.u = (unsigned)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN - // is converted to 0. 
void
Inst_VOP3__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    if (instData.ABS & 0x1) {
        src.absModifier();
    }

    if (extData.NEG & 0x1) {
        src.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            int exp;
            // Use the binary exponent to detect values too large for a
            // 32-bit unsigned result (exp > 31 => magnitude >= 2^32).
            std::frexp(src[lane],&exp);
            if (std::isnan(src[lane])) {
                vdst[lane] = 0;
            } else if (std::isinf(src[lane])) {
                if (std::signbit(src[lane])) {
                    vdst[lane] = 0;
                } else {
                    vdst[lane] = UINT_MAX;
                }
            } else if (exp > 31) {
                vdst[lane] = UINT_MAX;
            } else {
                vdst[lane] = (VecElemU32)src[lane];
            }
        }
    }

    vdst.write();
}

Inst_VOP3__V_CVT_I32_F32::Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_i32_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_I32_F32

Inst_VOP3__V_CVT_I32_F32::~Inst_VOP3__V_CVT_I32_F32()
{
} // ~Inst_VOP3__V_CVT_I32_F32

// D.i = (int)S0.f.
// Out-of-range floating point values (including infinity) saturate. NaN
// is converted to 0.
void
Inst_VOP3__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
    VecOperandI32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    if (instData.ABS & 0x1) {
        src.absModifier();
    }

    if (extData.NEG & 0x1) {
        src.negModifier();
    }

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            int exp;
            // exp > 30 => magnitude >= 2^31, which overflows a signed
            // int; saturate to INT_MIN/INT_MAX by sign.
            std::frexp(src[lane],&exp);
            if (std::isnan(src[lane])) {
                vdst[lane] = 0;
            } else if (std::isinf(src[lane]) || exp > 30) {
                if (std::signbit(src[lane])) {
                    vdst[lane] = INT_MIN;
                } else {
                    vdst[lane] = INT_MAX;
                }
            } else {
                vdst[lane] = (VecElemI32)src[lane];
            }
        }
    }

    vdst.write();
}

Inst_VOP3__V_MOV_FED_B32::Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mov_fed_b32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MOV_FED_B32

Inst_VOP3__V_MOV_FED_B32::~Inst_VOP3__V_MOV_FED_B32()
{
} // ~Inst_VOP3__V_MOV_FED_B32

// D.u = S0.u;
// Input and output modifiers not supported; this is an untyped operation.
void
Inst_VOP3__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_CVT_F16_F32::Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_f16_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_F16_F32

Inst_VOP3__V_CVT_F16_F32::~Inst_VOP3__V_CVT_F16_F32()
{
} // ~Inst_VOP3__V_CVT_F16_F32

// D.f16 = flt32_to_flt16(S0.f).
void
Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // F16 conversion is not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_CVT_F32_F16::Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_f32_f16", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_F32_F16

Inst_VOP3__V_CVT_F32_F16::~Inst_VOP3__V_CVT_F32_F16()
{
} // ~Inst_VOP3__V_CVT_F32_F16

// D.f = flt16_to_flt32(S0.f16).
void
Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // F16 conversion is not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_CVT_RPI_I32_F32::Inst_VOP3__V_CVT_RPI_I32_F32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_rpi_i32_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_RPI_I32_F32

Inst_VOP3__V_CVT_RPI_I32_F32::~Inst_VOP3__V_CVT_RPI_I32_F32()
{
} // ~Inst_VOP3__V_CVT_RPI_I32_F32

// D.i = (int)floor(S0.f + 0.5).
void
Inst_VOP3__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
    VecOperandI32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    if (instData.ABS & 0x1) {
        src.absModifier();
    }

    if (extData.NEG & 0x1) {
        src.negModifier();
    }

    // Round-to-plus-infinity after adding 0.5 (round-half-up).
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
        }
    }

    vdst.write();
}

Inst_VOP3__V_CVT_FLR_I32_F32::Inst_VOP3__V_CVT_FLR_I32_F32(
    InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_flr_i32_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_FLR_I32_F32

Inst_VOP3__V_CVT_FLR_I32_F32::~Inst_VOP3__V_CVT_FLR_I32_F32()
{
} // ~Inst_VOP3__V_CVT_FLR_I32_F32

// D.i = (int)floor(S0.f).
void
Inst_VOP3__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
    VecOperandI32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    if (instData.ABS & 0x1) {
        src.absModifier();
    }

    if (extData.NEG & 0x1) {
        src.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = (VecElemI32)std::floor(src[lane]);
        }
    }

    vdst.write();
}

Inst_VOP3__V_CVT_OFF_F32_I4::Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_off_f32_i4", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_OFF_F32_I4

Inst_VOP3__V_CVT_OFF_F32_I4::~Inst_VOP3__V_CVT_OFF_F32_I4()
{
} // ~Inst_VOP3__V_CVT_OFF_F32_I4

// 4-bit signed int to 32-bit float.
void
Inst_VOP3__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented in this ISA model.
    panicUnimplemented();
}

Inst_VOP3__V_CVT_F32_F64::Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_f32_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_CVT_F32_F64

Inst_VOP3__V_CVT_F32_F64::~Inst_VOP3__V_CVT_F32_F64()
{
} // ~Inst_VOP3__V_CVT_F32_F64

// D.f = (float)S0.d.
void
Inst_VOP3__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    if (instData.ABS & 0x1) {
        src.absModifier();
    }

    if (extData.NEG & 0x1) {
        src.negModifier();
    }

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    // Narrowing double-to-float conversion (rounds per the host FPU).
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = (VecElemF32)src[lane];
        }
    }

    vdst.write();
}

Inst_VOP3__V_CVT_F64_F32::Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_f64_f32", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_CVT_F64_F32

Inst_VOP3__V_CVT_F64_F32::~Inst_VOP3__V_CVT_F64_F32()
{
} // ~Inst_VOP3__V_CVT_F64_F32

// D.d = (double)S0.f.
- void - Inst_VOP3__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_CVT_F32_UBYTE0::Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cvt_f32_ubyte0", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE0 - - Inst_VOP3__V_CVT_F32_UBYTE0::~Inst_VOP3__V_CVT_F32_UBYTE0() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE0 - - // D.f = (float)(S0.u[7:0]). - void - Inst_VOP3__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 7, 0); - } - } - - vdst.write(); - } - - Inst_VOP3__V_CVT_F32_UBYTE1::Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cvt_f32_ubyte1", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE1 - - Inst_VOP3__V_CVT_F32_UBYTE1::~Inst_VOP3__V_CVT_F32_UBYTE1() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE1 - - // D.f = (float)(S0.u[15:8]). 
- void - Inst_VOP3__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 15, 8); - } - } - - vdst.write(); - } - - Inst_VOP3__V_CVT_F32_UBYTE2::Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cvt_f32_ubyte2", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE2 - - Inst_VOP3__V_CVT_F32_UBYTE2::~Inst_VOP3__V_CVT_F32_UBYTE2() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE2 - - // D.f = (float)(S0.u[23:16]). - void - Inst_VOP3__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 23, 16); - } - } - - vdst.write(); - } - - Inst_VOP3__V_CVT_F32_UBYTE3::Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cvt_f32_ubyte3", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_UBYTE3 - - Inst_VOP3__V_CVT_F32_UBYTE3::~Inst_VOP3__V_CVT_F32_UBYTE3() - { - } // ~Inst_VOP3__V_CVT_F32_UBYTE3 - - // D.f = (float)(S0.u[31:24]). 
- void - Inst_VOP3__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)bits(src[lane], 31, 24); - } - } - - vdst.write(); - } - - Inst_VOP3__V_CVT_U32_F64::Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cvt_u32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_U32_F64 - - Inst_VOP3__V_CVT_U32_F64::~Inst_VOP3__V_CVT_U32_F64() - { - } // ~Inst_VOP3__V_CVT_U32_F64 - - // D.u = (unsigned)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN - // is converted to 0. - void - Inst_VOP3__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP3__V_CVT_F64_U32::Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cvt_f64_u32", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F64_U32 - - Inst_VOP3__V_CVT_F64_U32::~Inst_VOP3__V_CVT_F64_U32() - { - } // ~Inst_VOP3__V_CVT_F64_U32 - - // 
D.d = (double)S0.u. - void - Inst_VOP3__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_TRUNC_F64::Inst_VOP3__V_TRUNC_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_trunc_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_TRUNC_F64 - - Inst_VOP3__V_TRUNC_F64::~Inst_VOP3__V_TRUNC_F64() - { - } // ~Inst_VOP3__V_TRUNC_F64 - - // D.d = trunc(S0.d), return integer part of S0.d. - void - Inst_VOP3__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_CEIL_F64::Inst_VOP3__V_CEIL_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_ceil_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CEIL_F64 - - Inst_VOP3__V_CEIL_F64::~Inst_VOP3__V_CEIL_F64() - { - } // ~Inst_VOP3__V_CEIL_F64 - - // D.d = ceil(S0.d); - void - Inst_VOP3__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < 
NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_RNDNE_F64::Inst_VOP3__V_RNDNE_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_rndne_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_RNDNE_F64 - - Inst_VOP3__V_RNDNE_F64::~Inst_VOP3__V_RNDNE_F64() - { - } // ~Inst_VOP3__V_RNDNE_F64 - - // D.d = round_nearest_even(S0.d). - void - Inst_VOP3__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_FLOOR_F64::Inst_VOP3__V_FLOOR_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_floor_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FLOOR_F64 - - Inst_VOP3__V_FLOOR_F64::~Inst_VOP3__V_FLOOR_F64() - { - } // ~Inst_VOP3__V_FLOOR_F64 - - // D.d = floor(S0.d); - void - Inst_VOP3__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_FRACT_F32::Inst_VOP3__V_FRACT_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_fract_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FRACT_F32 - - Inst_VOP3__V_FRACT_F32::~Inst_VOP3__V_FRACT_F32() - { - } // ~Inst_VOP3__V_FRACT_F32 - - 
// D.f = modf(S0.f). - void - Inst_VOP3__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } - - Inst_VOP3__V_TRUNC_F32::Inst_VOP3__V_TRUNC_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_trunc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_TRUNC_F32 - - Inst_VOP3__V_TRUNC_F32::~Inst_VOP3__V_TRUNC_F32() - { - } // ~Inst_VOP3__V_TRUNC_F32 - - // D.f = trunc(S0.f), return integer part of S0.f. - void - Inst_VOP3__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_CEIL_F32::Inst_VOP3__V_CEIL_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_ceil_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CEIL_F32 - - Inst_VOP3__V_CEIL_F32::~Inst_VOP3__V_CEIL_F32() - { - } // ~Inst_VOP3__V_CEIL_F32 - - // D.f = ceil(S0.f); - void - Inst_VOP3__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - 
src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::ceil(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_RNDNE_F32::Inst_VOP3__V_RNDNE_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_rndne_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RNDNE_F32 - - Inst_VOP3__V_RNDNE_F32::~Inst_VOP3__V_RNDNE_F32() - { - } // ~Inst_VOP3__V_RNDNE_F32 - - // D.f = round_nearest_even(S0.f). - void - Inst_VOP3__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = roundNearestEven(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_FLOOR_F32::Inst_VOP3__V_FLOOR_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_floor_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FLOOR_F32 - - Inst_VOP3__V_FLOOR_F32::~Inst_VOP3__V_FLOOR_F32() - { - } // ~Inst_VOP3__V_FLOOR_F32 - - // D.f = floor(S0.f); - void - Inst_VOP3__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_EXP_F32::Inst_VOP3__V_EXP_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_exp_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_EXP_F32 - - 
Inst_VOP3__V_EXP_F32::~Inst_VOP3__V_EXP_F32() - { - } // ~Inst_VOP3__V_EXP_F32 - - // D.f = pow(2.0, S0.f). - void - Inst_VOP3__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_LOG_F32::Inst_VOP3__V_LOG_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_log_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LOG_F32 - - Inst_VOP3__V_LOG_F32::~Inst_VOP3__V_LOG_F32() - { - } // ~Inst_VOP3__V_LOG_F32 - - // D.f = log2(S0.f). - void - Inst_VOP3__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_RCP_F32::Inst_VOP3__V_RCP_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_rcp_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RCP_F32 - - Inst_VOP3__V_RCP_F32::~Inst_VOP3__V_RCP_F32() - { - } // ~Inst_VOP3__V_RCP_F32 - - // D.f = 1.0 / S0.f. 
- void - Inst_VOP3__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_RCP_IFLAG_F32::Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_rcp_iflag_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RCP_IFLAG_F32 - - Inst_VOP3__V_RCP_IFLAG_F32::~Inst_VOP3__V_RCP_IFLAG_F32() - { - } // ~Inst_VOP3__V_RCP_IFLAG_F32 - - // D.f = 1.0 / S0.f. - void - Inst_VOP3__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_RSQ_F32::Inst_VOP3__V_RSQ_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_rsq_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RSQ_F32 - - Inst_VOP3__V_RSQ_F32::~Inst_VOP3__V_RSQ_F32() - { - } // ~Inst_VOP3__V_RSQ_F32 - - // D.f = 1.0 / sqrt(S0.f). 
- void - Inst_VOP3__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_RCP_F64::Inst_VOP3__V_RCP_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_rcp_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_RCP_F64 - - Inst_VOP3__V_RCP_F64::~Inst_VOP3__V_RCP_F64() - { - } // ~Inst_VOP3__V_RCP_F64 - - // D.d = 1.0 / S0.d. - void - Inst_VOP3__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = 1.0 / src[lane]; - } - } - } - - vdst.write(); - } - - Inst_VOP3__V_RSQ_F64::Inst_VOP3__V_RSQ_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_rsq_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_RSQ_F64 - - Inst_VOP3__V_RSQ_F64::~Inst_VOP3__V_RSQ_F64() - { - } // ~Inst_VOP3__V_RSQ_F64 - - // D.d = 1.0 / sqrt(S0.d). 
- void - Inst_VOP3__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) { - vdst[lane] = 0.0; - } else if (std::signbit(src[lane])) { - vdst[lane] = NAN; - } else { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - } - - vdst.write(); - } - - Inst_VOP3__V_SQRT_F32::Inst_VOP3__V_SQRT_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_sqrt_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SQRT_F32 - - Inst_VOP3__V_SQRT_F32::~Inst_VOP3__V_SQRT_F32() - { - } // ~Inst_VOP3__V_SQRT_F32 - - // D.f = sqrt(S0.f). - void - Inst_VOP3__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_SQRT_F64::Inst_VOP3__V_SQRT_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_sqrt_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_SQRT_F64 - - Inst_VOP3__V_SQRT_F64::~Inst_VOP3__V_SQRT_F64() - { - } // ~Inst_VOP3__V_SQRT_F64 - - // D.d = sqrt(S0.d). 
- void - Inst_VOP3__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sqrt(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_SIN_F32::Inst_VOP3__V_SIN_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_sin_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SIN_F32 - - Inst_VOP3__V_SIN_F32::~Inst_VOP3__V_SIN_F32() - { - } // ~Inst_VOP3__V_SIN_F32 - - // D.f = sin(S0.f * 2 * PI). - void - Inst_VOP3__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::sin(src[lane] * 2 * pi.rawData()); - } - } - - vdst.write(); - } - - Inst_VOP3__V_COS_F32::Inst_VOP3__V_COS_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cos_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_COS_F32 - - Inst_VOP3__V_COS_F32::~Inst_VOP3__V_COS_F32() - { - } // ~Inst_VOP3__V_COS_F32 - - // D.f = cos(S0.f * 2 * PI). 
- void - Inst_VOP3__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::cos(src[lane] * 2 * pi.rawData()); - } - } - - vdst.write(); - } - - Inst_VOP3__V_NOT_B32::Inst_VOP3__V_NOT_B32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_not_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_NOT_B32 - - Inst_VOP3__V_NOT_B32::~Inst_VOP3__V_NOT_B32() - { - } // ~Inst_VOP3__V_NOT_B32 - - // D.u = ~S0.u. - // Input and output modifiers not supported. - void - Inst_VOP3__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ~src[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_BFREV_B32::Inst_VOP3__V_BFREV_B32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_bfrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFREV_B32 - - Inst_VOP3__V_BFREV_B32::~Inst_VOP3__V_BFREV_B32() - { - } // ~Inst_VOP3__V_BFREV_B32 - - // D.u[31:0] = S0.u[0:31], bitfield reverse. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = reverseBits(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_FFBH_U32::Inst_VOP3__V_FFBH_U32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_ffbh_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_FFBH_U32 - - Inst_VOP3__V_FFBH_U32::~Inst_VOP3__V_FFBH_U32() - { - } // ~Inst_VOP3__V_FFBH_U32 - - // D.u = position of first 1 in S0.u from MSB; - // D.u = 0xffffffff if S0.u == 0. - void - Inst_VOP3__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOneMsb(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_FFBL_B32::Inst_VOP3__V_FFBL_B32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_ffbl_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_FFBL_B32 - - Inst_VOP3__V_FFBL_B32::~Inst_VOP3__V_FFBL_B32() - { - } // ~Inst_VOP3__V_FFBL_B32 - - // D.u = position of first 1 in S0.u from LSB; - // D.u = 0xffffffff if S0.u == 0. 
- void - Inst_VOP3__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = findFirstOne(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_FFBH_I32::Inst_VOP3__V_FFBH_I32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_ffbh_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_FFBH_I32 - - Inst_VOP3__V_FFBH_I32::~Inst_VOP3__V_FFBH_I32() - { - } // ~Inst_VOP3__V_FFBH_I32 - - // D.u = position of first bit different from sign bit in S0.i from MSB; - // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. - void - Inst_VOP3__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = firstOppositeSignBit(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_FREXP_EXP_I32_F64::Inst_VOP3__V_FREXP_EXP_I32_F64( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_frexp_exp_i32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FREXP_EXP_I32_F64 - - Inst_VOP3__V_FREXP_EXP_I32_F64::~Inst_VOP3__V_FREXP_EXP_I32_F64() - { - } // ~Inst_VOP3__V_FREXP_EXP_I32_F64 - - // See V_FREXP_EXP_I32_F32. 
- void - Inst_VOP3__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp(0); - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } - - Inst_VOP3__V_FREXP_MANT_F64::Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_frexp_mant_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FREXP_MANT_F64 - - Inst_VOP3__V_FREXP_MANT_F64::~Inst_VOP3__V_FREXP_MANT_F64() - { - } // ~Inst_VOP3__V_FREXP_MANT_F64 - - void - Inst_VOP3__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - - vdst.write(); - } - - Inst_VOP3__V_FRACT_F64::Inst_VOP3__V_FRACT_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_fract_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FRACT_F64 - - Inst_VOP3__V_FRACT_F64::~Inst_VOP3__V_FRACT_F64() - { - } // ~Inst_VOP3__V_FRACT_F64 - - void - Inst_VOP3__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - 
src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } - - Inst_VOP3__V_FREXP_EXP_I32_F32::Inst_VOP3__V_FREXP_EXP_I32_F32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_frexp_exp_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FREXP_EXP_I32_F32 - - Inst_VOP3__V_FREXP_EXP_I32_F32::~Inst_VOP3__V_FREXP_EXP_I32_F32() - { - } // ~Inst_VOP3__V_FREXP_EXP_I32_F32 - - // frexp(S0.f, Exponenti(S0.f)) - // if (S0.f == INF || S0.f == NAN) then D.i = 0; - // else D.i = Exponent(S0.f) - void - Inst_VOP3__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane])|| std::isnan(src[lane])) { - vdst[lane] = 0; - } else { - VecElemI32 exp(0); - std::frexp(src[lane], &exp); - vdst[lane] = exp; - } - } - } - - vdst.write(); - } - - Inst_VOP3__V_FREXP_MANT_F32::Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_frexp_mant_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FREXP_MANT_F32 - - Inst_VOP3__V_FREXP_MANT_F32::~Inst_VOP3__V_FREXP_MANT_F32() - { - } // ~Inst_VOP3__V_FREXP_MANT_F32 - - // if (S0.f == INF || S0.f == NAN) then D.f = S0.f; - // else D.f = Mantissa(S0.f). 
    void
    Inst_VOP3__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // VOP3 input modifiers: bit 0 of ABS/NEG applies to SRC0.
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // INF/NAN pass through unchanged; otherwise the mantissa
                // returned by std::frexp is kept and the exponent dropped.
                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
                    vdst[lane] = src[lane];
                } else {
                    VecElemI32 exp(0);
                    vdst[lane] = std::frexp(src[lane], &exp);
                }
            }
        }

        vdst.write();
    }

    // Clear wave exception state. Note: no ALU flag is set here, matching
    // the original code — NOTE(review): confirm this omission is intended.
    Inst_VOP3__V_CLREXCP::Inst_VOP3__V_CLREXCP(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_clrexcp", false)
    {
    } // Inst_VOP3__V_CLREXCP

    Inst_VOP3__V_CLREXCP::~Inst_VOP3__V_CLREXCP()
    {
    } // ~Inst_VOP3__V_CLREXCP

    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CVT_F16_U16::Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f16_u16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CVT_F16_U16

    Inst_VOP3__V_CVT_F16_U16::~Inst_VOP3__V_CVT_F16_U16()
    {
    } // ~Inst_VOP3__V_CVT_F16_U16

    // D.f16 = uint16_to_flt16(S.u16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CVT_F16_I16::Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_f16_i16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CVT_F16_I16

    Inst_VOP3__V_CVT_F16_I16::~Inst_VOP3__V_CVT_F16_I16()
    {
    } // ~Inst_VOP3__V_CVT_F16_I16

    // D.f16 = int16_to_flt16(S.i16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CVT_U16_F16::Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_u16_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CVT_U16_F16

    Inst_VOP3__V_CVT_U16_F16::~Inst_VOP3__V_CVT_U16_F16()
    {
    } // ~Inst_VOP3__V_CVT_U16_F16

    // D.u16 = flt16_to_uint16(S.f16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CVT_I16_F16::Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_i16_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CVT_I16_F16

    Inst_VOP3__V_CVT_I16_F16::~Inst_VOP3__V_CVT_I16_F16()
    {
    } // ~Inst_VOP3__V_CVT_I16_F16

    // D.i16 = flt16_to_int16(S.f16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_RCP_F16::Inst_VOP3__V_RCP_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rcp_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_RCP_F16

    Inst_VOP3__V_RCP_F16::~Inst_VOP3__V_RCP_F16()
    {
    } // ~Inst_VOP3__V_RCP_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = 1 / S0.f16.
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_SQRT_F16::Inst_VOP3__V_SQRT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sqrt_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_SQRT_F16

    Inst_VOP3__V_SQRT_F16::~Inst_VOP3__V_SQRT_F16()
    {
    } // ~Inst_VOP3__V_SQRT_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = sqrt(S0.f16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_RSQ_F16::Inst_VOP3__V_RSQ_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rsq_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_RSQ_F16

    Inst_VOP3__V_RSQ_F16::~Inst_VOP3__V_RSQ_F16()
    {
    } // ~Inst_VOP3__V_RSQ_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = 1 / sqrt(S0.f16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_LOG_F16::Inst_VOP3__V_LOG_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_log_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_LOG_F16

    Inst_VOP3__V_LOG_F16::~Inst_VOP3__V_LOG_F16()
    {
    } // ~Inst_VOP3__V_LOG_F16

    // if (S0.f16 == 1.0f)
    //     D.f16 = 0.0f;
    // else
    //     D.f16 = log2(S0.f16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_EXP_F16::Inst_VOP3__V_EXP_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_exp_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_EXP_F16

    Inst_VOP3__V_EXP_F16::~Inst_VOP3__V_EXP_F16()
    {
    } // ~Inst_VOP3__V_EXP_F16

    // if (S0.f16 == 0.0f)
    //     D.f16 = 1.0f;
    // else
    //     D.f16 = pow(2.0, S0.f16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_FREXP_MANT_F16::Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_frexp_mant_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_FREXP_MANT_F16

    Inst_VOP3__V_FREXP_MANT_F16::~Inst_VOP3__V_FREXP_MANT_F16()
    {
    } // ~Inst_VOP3__V_FREXP_MANT_F16

    // if (S0.f16 == +-INF || S0.f16 == NAN)
    //     D.f16 = S0.f16;
    // else
    //     D.f16 = mantissa(S0.f16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_FREXP_EXP_I16_F16::Inst_VOP3__V_FREXP_EXP_I16_F16(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_frexp_exp_i16_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_FREXP_EXP_I16_F16

    Inst_VOP3__V_FREXP_EXP_I16_F16::~Inst_VOP3__V_FREXP_EXP_I16_F16()
    {
    } // ~Inst_VOP3__V_FREXP_EXP_I16_F16

    // Presumably D.i16 = Exponent(S0.f16), by analogy with the I32/F32
    // variant — not implemented here; executing it panics the simulation.
    void
    Inst_VOP3__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_FLOOR_F16::Inst_VOP3__V_FLOOR_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_floor_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_FLOOR_F16

    Inst_VOP3__V_FLOOR_F16::~Inst_VOP3__V_FLOOR_F16()
    {
    } // ~Inst_VOP3__V_FLOOR_F16

    // D.f16 = floor(S0.f16);
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CEIL_F16::Inst_VOP3__V_CEIL_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_ceil_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CEIL_F16

    Inst_VOP3__V_CEIL_F16::~Inst_VOP3__V_CEIL_F16()
    {
    } // ~Inst_VOP3__V_CEIL_F16

    // D.f16 = ceil(S0.f16);
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_TRUNC_F16::Inst_VOP3__V_TRUNC_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_trunc_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_TRUNC_F16

    Inst_VOP3__V_TRUNC_F16::~Inst_VOP3__V_TRUNC_F16()
    {
    } // ~Inst_VOP3__V_TRUNC_F16

    // D.f16 = trunc(S0.f16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_RNDNE_F16::Inst_VOP3__V_RNDNE_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_rndne_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_RNDNE_F16

    Inst_VOP3__V_RNDNE_F16::~Inst_VOP3__V_RNDNE_F16()
    {
    } // ~Inst_VOP3__V_RNDNE_F16

    // D.f16 = roundNearestEven(S0.f16);
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_FRACT_F16::Inst_VOP3__V_FRACT_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fract_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_FRACT_F16

    Inst_VOP3__V_FRACT_F16::~Inst_VOP3__V_FRACT_F16()
    {
    } // ~Inst_VOP3__V_FRACT_F16

    // D.f16 = S0.f16 + -floor(S0.f16).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_SIN_F16::Inst_VOP3__V_SIN_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sin_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_SIN_F16

    Inst_VOP3__V_SIN_F16::~Inst_VOP3__V_SIN_F16()
    {
    } // ~Inst_VOP3__V_SIN_F16

    // D.f16 = sin(S0.f16 * 2 * PI).
    // Not implemented in this model; executing it panics the simulation.
    void
    Inst_VOP3__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_COS_F16::Inst_VOP3__V_COS_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cos_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_COS_F16

    Inst_VOP3__V_COS_F16::~Inst_VOP3__V_COS_F16()
    {
    } // ~Inst_VOP3__V_COS_F16

    // D.f16 = cos(S0.f16 * 2 * PI).
- void - Inst_VOP3__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_VOP3__V_EXP_LEGACY_F32::Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_exp_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_EXP_LEGACY_F32 - - Inst_VOP3__V_EXP_LEGACY_F32::~Inst_VOP3__V_EXP_LEGACY_F32() - { - } // ~Inst_VOP3__V_EXP_LEGACY_F32 - - // D.f = pow(2.0, S0.f) - void - Inst_VOP3__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_LOG_LEGACY_F32::Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_log_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LOG_LEGACY_F32 - - Inst_VOP3__V_LOG_LEGACY_F32::~Inst_VOP3__V_LOG_LEGACY_F32() - { - } // ~Inst_VOP3__V_LOG_LEGACY_F32 - - // D.f = log2(S0.f). 
- void - Inst_VOP3__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MAD_LEGACY_F32::Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_mad_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP3__V_MAD_LEGACY_F32 - - Inst_VOP3__V_MAD_LEGACY_F32::~Inst_VOP3__V_MAD_LEGACY_F32() - { - } // ~Inst_VOP3__V_MAD_LEGACY_F32 - - // D.f = S0.f * S1.f + S2.f - void - Inst_VOP3__V_MAD_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MAD_F32::Inst_VOP3__V_MAD_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_mad_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP3__V_MAD_F32 - - Inst_VOP3__V_MAD_F32::~Inst_VOP3__V_MAD_F32() - { - } // ~Inst_VOP3__V_MAD_F32 - - // D.f = S0.f * S1.f + S2.f. 
    void
    Inst_VOP3__V_MAD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // ABS/NEG bits 0/1/2 select SRC0/SRC1/SRC2 respectively.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Modeled as a fused multiply-add (single rounding).
                vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MAD_I32_I24::Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_i32_i24", false)
    {
        setFlag(ALU);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_I32_I24

    Inst_VOP3__V_MAD_I32_I24::~Inst_VOP3__V_MAD_I32_I24()
    {
    } // ~Inst_VOP3__V_MAD_I32_I24

    // D.i = S0.i[23:0] * S1.i[23:0] + S2.i.
    void
    Inst_VOP3__V_MAD_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // szext<24> sign-extends the low 24 bits before the
                // multiply, per the signed i24 semantics.
                vdst[lane] = szext<24>(src0[lane])
                    * szext<24>(src1[lane]) + src2[lane];
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MAD_U32_U24::Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_u32_u24", false)
    {
        setFlag(ALU);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_U32_U24

    Inst_VOP3__V_MAD_U32_U24::~Inst_VOP3__V_MAD_U32_U24()
    {
    } // ~Inst_VOP3__V_MAD_U32_U24

    // D.u = S0.u[23:0] * S1.u[23:0] + S2.u.
    void
    Inst_VOP3__V_MAD_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Zero-extended 24-bit multiply, unsigned u24 semantics.
                vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0)
                    + src2[lane];
            }
        }

        vdst.write();
    }

    // Cubemap face-ID op — not implemented in this model.
    Inst_VOP3__V_CUBEID_F32::Inst_VOP3__V_CUBEID_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cubeid_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CUBEID_F32

    Inst_VOP3__V_CUBEID_F32::~Inst_VOP3__V_CUBEID_F32()
    {
    } // ~Inst_VOP3__V_CUBEID_F32

    void
    Inst_VOP3__V_CUBEID_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Cubemap S-coordinate op — not implemented in this model.
    Inst_VOP3__V_CUBESC_F32::Inst_VOP3__V_CUBESC_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cubesc_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CUBESC_F32

    Inst_VOP3__V_CUBESC_F32::~Inst_VOP3__V_CUBESC_F32()
    {
    } // ~Inst_VOP3__V_CUBESC_F32

    void
    Inst_VOP3__V_CUBESC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Cubemap T-coordinate op — not implemented in this model.
    Inst_VOP3__V_CUBETC_F32::Inst_VOP3__V_CUBETC_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cubetc_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CUBETC_F32

    Inst_VOP3__V_CUBETC_F32::~Inst_VOP3__V_CUBETC_F32()
    {
    } // ~Inst_VOP3__V_CUBETC_F32

    void
    Inst_VOP3__V_CUBETC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    // Cubemap major-axis op — not implemented in this model.
    Inst_VOP3__V_CUBEMA_F32::Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cubema_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CUBEMA_F32

    Inst_VOP3__V_CUBEMA_F32::~Inst_VOP3__V_CUBEMA_F32()
    {
    } // ~Inst_VOP3__V_CUBEMA_F32

    void
    Inst_VOP3__V_CUBEMA_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_BFE_U32::Inst_VOP3__V_BFE_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_bfe_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BFE_U32

    Inst_VOP3__V_BFE_U32::~Inst_VOP3__V_BFE_U32()
    {
    } // ~Inst_VOP3__V_BFE_U32

    // D.u = (S0.u >> S1.u[4:0]) & ((1 << S2.u[4:0]) - 1).
    // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
    void
    Inst_VOP3__V_BFE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Offset and width are both taken modulo 32 (low 5 bits).
                vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
                    & ((1 << bits(src2[lane], 4, 0)) - 1);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_BFE_I32::Inst_VOP3__V_BFE_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_bfe_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BFE_I32

    Inst_VOP3__V_BFE_I32::~Inst_VOP3__V_BFE_I32()
    {
    } // ~Inst_VOP3__V_BFE_I32

    // D.i = (S0.i >> S1.u[4:0]) & ((1 << S2.u[4:0]) - 1).
    // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
    void
    Inst_VOP3__V_BFE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // NOTE(review): shift of a signed src0 is arithmetic here,
                // then masked; matches the pseudocode above.
                vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
                    & ((1 << bits(src2[lane], 4, 0)) - 1);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_BFI_B32::Inst_VOP3__V_BFI_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_bfi_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BFI_B32

    Inst_VOP3__V_BFI_B32::~Inst_VOP3__V_BFI_B32()
    {
    } // ~Inst_VOP3__V_BFI_B32

    // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert.
    void
    Inst_VOP3__V_BFI_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // S0 is the bitmask selecting between S1 and S2.
                vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane]
                    & src2[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_FMA_F32::Inst_VOP3__V_FMA_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fma_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(FMA);
    } // Inst_VOP3__V_FMA_F32

    Inst_VOP3__V_FMA_F32::~Inst_VOP3__V_FMA_F32()
    {
    } // ~Inst_VOP3__V_FMA_F32

    // D.f = S0.f * S1.f + S2.f.
    void
    Inst_VOP3__V_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // ABS/NEG bits 0/1/2 select SRC0/SRC1/SRC2 respectively.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_FMA_F64::Inst_VOP3__V_FMA_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fma_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
        setFlag(FMA);
    } // Inst_VOP3__V_FMA_F64

    Inst_VOP3__V_FMA_F64::~Inst_VOP3__V_FMA_F64()
    {
    } // ~Inst_VOP3__V_FMA_F64

    // D.d = S0.d * S1.d + S2.d.
    void
    Inst_VOP3__V_FMA_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // ABS/NEG bits 0/1/2 select SRC0/SRC1/SRC2 respectively.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_LERP_U8::Inst_VOP3__V_LERP_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_lerp_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LERP_U8

    Inst_VOP3__V_LERP_U8::~Inst_VOP3__V_LERP_U8()
    {
    } // ~Inst_VOP3__V_LERP_U8

    // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24
    // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16;
    // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8;
    // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1).
    void
    Inst_VOP3__V_LERP_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Per-byte average of S0 and S1; the low bit of each
                // corresponding S2 byte supplies the rounding carry.
                vdst[lane] = ((bits(src0[lane], 31, 24)
                    + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1)
                        << 24;
                vdst[lane] += ((bits(src0[lane], 23, 16)
                    + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1)
                        << 16;
                vdst[lane] += ((bits(src0[lane], 15, 8)
                    + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1)
                        << 8;
                vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0)
                    + bits(src2[lane], 0)) >> 1);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_ALIGNBIT_B32::Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_alignbit_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ALIGNBIT_B32

    Inst_VOP3__V_ALIGNBIT_B32::~Inst_VOP3__V_ALIGNBIT_B32()
    {
    } // ~Inst_VOP3__V_ALIGNBIT_B32

    // D.u = ({S0, S1} >> S2.u[4:0]) & 0xffffffff.
    void
    Inst_VOP3__V_ALIGNBIT_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Concatenate S0 (high) and S1 (low), shift right by
                // S2[4:0] bits, keep the low 32 bits.
                VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
                    | (VecElemU64)src1[lane]);
                vdst[lane] = (VecElemU32)((src_0_1
                    >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_ALIGNBYTE_B32::Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_alignbyte_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ALIGNBYTE_B32

    Inst_VOP3__V_ALIGNBYTE_B32::~Inst_VOP3__V_ALIGNBYTE_B32()
    {
    } // ~Inst_VOP3__V_ALIGNBYTE_B32

    // D.u = ({S0, S1} >> (8 * S2.u[4:0])) & 0xffffffff.
    void
    Inst_VOP3__V_ALIGNBYTE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Same as v_alignbit_b32 but the shift count is in bytes.
                VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
                    | (VecElemU64)src1[lane]);
                vdst[lane] = (VecElemU32)((src_0_1
                    >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0)))
                        & 0xffffffff);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MIN3_F32::Inst_VOP3__V_MIN3_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min3_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MIN3_F32

    Inst_VOP3__V_MIN3_F32::~Inst_VOP3__V_MIN3_F32()
    {
    } // ~Inst_VOP3__V_MIN3_F32

    // D.f = min(S0.f, S1.f, S2.f).
    void
    Inst_VOP3__V_MIN3_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // ABS/NEG bits 0/1/2 select SRC0/SRC1/SRC2 respectively.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // std::fmin prefers the numeric operand when one is NaN.
                VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]);
                vdst[lane] = std::fmin(min_0_1, src2[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MIN3_I32::Inst_VOP3__V_MIN3_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min3_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN3_I32

    Inst_VOP3__V_MIN3_I32::~Inst_VOP3__V_MIN3_I32()
    {
    } // ~Inst_VOP3__V_MIN3_I32

    // D.i = min(S0.i, S1.i, S2.i).
    void
    Inst_VOP3__V_MIN3_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]);
                vdst[lane] = std::min(min_0_1, src2[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MIN3_U32::Inst_VOP3__V_MIN3_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_min3_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN3_U32

    Inst_VOP3__V_MIN3_U32::~Inst_VOP3__V_MIN3_U32()
    {
    } // ~Inst_VOP3__V_MIN3_U32

    // D.u = min(S0.u, S1.u, S2.u).
    void
    Inst_VOP3__V_MIN3_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]);
                vdst[lane] = std::min(min_0_1, src2[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MAX3_F32::Inst_VOP3__V_MAX3_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_max3_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_MAX3_F32

    Inst_VOP3__V_MAX3_F32::~Inst_VOP3__V_MAX3_F32()
    {
    } // ~Inst_VOP3__V_MAX3_F32

    // D.f = max(S0.f, S1.f, S2.f).
- void - Inst_VOP3__V_MAX3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]); - vdst[lane] = std::fmax(max_0_1, src2[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MAX3_I32::Inst_VOP3__V_MAX3_I32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_max3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX3_I32 - - Inst_VOP3__V_MAX3_I32::~Inst_VOP3__V_MAX3_I32() - { - } // ~Inst_VOP3__V_MAX3_I32 - - // D.i = max(S0.i, S1.i, S2.i). 
- void - Inst_VOP3__V_MAX3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]); - vdst[lane] = std::max(max_0_1, src2[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MAX3_U32::Inst_VOP3__V_MAX3_U32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_max3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX3_U32 - - Inst_VOP3__V_MAX3_U32::~Inst_VOP3__V_MAX3_U32() - { - } // ~Inst_VOP3__V_MAX3_U32 - - // D.u = max(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MAX3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]); - vdst[lane] = std::max(max_0_1, src2[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_MED3_F32::Inst_VOP3__V_MED3_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_med3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MED3_F32 - - Inst_VOP3__V_MED3_F32::~Inst_VOP3__V_MED3_F32() - { - } // ~Inst_VOP3__V_MED3_F32 - - // D.f = median(S0.f, S1.f, S2.f). 
    void
    Inst_VOP3__V_MED3_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // Apply FP input modifiers: all absolute-value modifiers first,
        // then all negation modifiers (bit n selects source n).
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        // Per-lane median of the three FP sources for active lanes.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MED3_I32::Inst_VOP3__V_MED3_I32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_med3_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MED3_I32

    Inst_VOP3__V_MED3_I32::~Inst_VOP3__V_MED3_I32()
    {
    } // ~Inst_VOP3__V_MED3_I32

    // D.i = median(S0.i, S1.i, S2.i).
    void
    Inst_VOP3__V_MED3_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Per-lane median of the three signed-integer sources for lanes
        // enabled in the execution mask.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_MED3_U32::Inst_VOP3__V_MED3_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_med3_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MED3_U32

    Inst_VOP3__V_MED3_U32::~Inst_VOP3__V_MED3_U32()
    {
    } // ~Inst_VOP3__V_MED3_U32

    // D.u = median(S0.u, S1.u, S2.u).
- void - Inst_VOP3__V_MED3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } - - Inst_VOP3__V_SAD_U8::Inst_VOP3__V_SAD_U8(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_sad_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U8 - - Inst_VOP3__V_SAD_U8::~Inst_VOP3__V_SAD_U8() - { - } // ~Inst_VOP3__V_SAD_U8 - - // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) + - // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u. - // Sum of absolute differences with accumulation, overflow into upper bits - // is allowed. 
    void
    Inst_VOP3__V_SAD_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Sum of absolute byte differences between S0 and S1, accumulated
        // into S2, per active lane.
        // NOTE(review): the result type/width of bits() here determines
        // whether the subtraction is signed before std::abs — presumably it
        // yields the operand's (signed) element type; verify against the
        // bits() helper definition.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::abs(bits(src0[lane], 31, 24)
                    - bits(src1[lane], 31, 24))
                    + std::abs(bits(src0[lane], 23, 16)
                    - bits(src1[lane], 23, 16))
                    + std::abs(bits(src0[lane], 15, 8)
                    - bits(src1[lane], 15, 8))
                    + std::abs(bits(src0[lane], 7, 0)
                    - bits(src1[lane], 7, 0)) + src2[lane];
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_SAD_HI_U8::Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sad_hi_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SAD_HI_U8

    Inst_VOP3__V_SAD_HI_U8::~Inst_VOP3__V_SAD_HI_U8()
    {
    } // ~Inst_VOP3__V_SAD_HI_U8

    // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u.
    // Sum of absolute differences with accumulation, overflow is lost.
- void - Inst_VOP3__V_SAD_HI_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (((bits(src0[lane], 31, 24) - - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16) - - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8) - - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0) - - bits(src1[lane], 7, 0))) << 16) + src2[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_SAD_U16::Inst_VOP3__V_SAD_U16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_sad_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U16 - - Inst_VOP3__V_SAD_U16::~Inst_VOP3__V_SAD_U16() - { - } // ~Inst_VOP3__V_SAD_U16 - - // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0]) - // + S2.u. - // Word SAD with accumulation. 
    void
    Inst_VOP3__V_SAD_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Sum of absolute half-word differences between S0 and S1,
        // accumulated into S2, per active lane.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::abs(bits(src0[lane], 31, 16)
                    - bits(src1[lane], 31, 16))
                    + std::abs(bits(src0[lane], 15, 0)
                    - bits(src1[lane], 15, 0)) + src2[lane];
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_SAD_U32::Inst_VOP3__V_SAD_U32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_sad_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SAD_U32

    Inst_VOP3__V_SAD_U32::~Inst_VOP3__V_SAD_U32()
    {
    } // ~Inst_VOP3__V_SAD_U32

    // D.u = abs(S0.i - S1.i) + S2.u.
    // Dword SAD with accumulation.
- void - Inst_VOP3__V_SAD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_CVT_PK_U8_F32::Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_cvt_pk_u8_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PK_U8_F32 - - Inst_VOP3__V_CVT_PK_U8_F32::~Inst_VOP3__V_CVT_PK_U8_F32() - { - } // ~Inst_VOP3__V_CVT_PK_U8_F32 - - // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0])) - // | (S2.u & ~(0xff << (8 * S1.u[1:0]))). - // Convert floating point value S0 to 8-bit unsigned integer and pack the - // result into byte S1 of dword S2. 
    void
    Inst_VOP3__V_CVT_PK_U8_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // Only src0 is FP here, so only its ABS/NEG modifier bits apply.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }


        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Convert S0 to uint8 and insert it into byte S1.u[1:0] of S2,
        // preserving the other three bytes.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (((VecElemU8)src0[lane] & 0xff)
                    << (8 * bits(src1[lane], 1, 0)))
                    | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0))));
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_DIV_FIXUP_F32::Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_div_fixup_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_DIV_FIXUP_F32

    Inst_VOP3__V_DIV_FIXUP_F32::~Inst_VOP3__V_DIV_FIXUP_F32()
    {
    } // ~Inst_VOP3__V_DIV_FIXUP_F32

    // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator,
    // s2.f = Numerator.
    void
    Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // FP input modifiers: absolute value first, then negation.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        // Special-case handling for division: src1 = denominator,
        // src2 = numerator, src0 = quotient (unused except via the
        // fall-through recompute below).
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // x/0 -> signed infinity (sign of the zero denominator).
                if (std::fpclassify(src1[lane]) == FP_ZERO) {
                    if (std::signbit(src1[lane])) {
                        vdst[lane] = -INFINITY;
                    } else {
                        vdst[lane] = +INFINITY;
                    }
                // NaN numerator or denominator propagates NaN.
                } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) {
                    vdst[lane] = NAN;
                // Infinite denominator -> signed infinity.
                // NOTE(review): x/inf yielding inf (not 0) looks suspect
                // relative to the usual IEEE result — confirm against the
                // ISA's div_fixup specification.
                } else if (std::isinf(src1[lane])) {
                    if (std::signbit(src1[lane])) {
                        vdst[lane] = -INFINITY;
                    } else {
                        vdst[lane] = +INFINITY;
                    }
                // Ordinary case: recompute the quotient directly.
                } else {
                    vdst[lane] = src2[lane] / src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---

    Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_div_fixup_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_DIV_FIXUP_F64

    Inst_VOP3__V_DIV_FIXUP_F64::~Inst_VOP3__V_DIV_FIXUP_F64()
    {
    } // ~Inst_VOP3__V_DIV_FIXUP_F64

    // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
    // s2.d = Numerator.
- void - Inst_VOP3__V_DIV_FIXUP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int sign_out = std::signbit(src1[lane]) - ^ std::signbit(src2[lane]); - int exp1(0); - int exp2(0); - std::frexp(src1[lane], &exp1); - std::frexp(src2[lane], &exp2); - - if (std::isnan(src1[lane]) || std::isnan(src2[lane])) { - vdst[lane] = std::numeric_limits::quiet_NaN(); - } else if (std::fpclassify(src1[lane]) == FP_ZERO - && std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] - = std::numeric_limits::signaling_NaN(); - } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { - vdst[lane] - = std::numeric_limits::signaling_NaN(); - } else if (std::fpclassify(src1[lane]) == FP_ZERO - || std::isinf(src2[lane])) { - vdst[lane] = sign_out ? -INFINITY : +INFINITY; - } else if (std::isinf(src1[lane]) - || std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] = sign_out ? -0.0 : +0.0; - } else if (exp2 - exp1 < -1075) { - vdst[lane] = src0[lane]; - } else if (exp1 == 2047) { - vdst[lane] = src0[lane]; - } else { - vdst[lane] = sign_out ? 
-std::fabs(src0[lane]) - : std::fabs(src0[lane]); - } - } - } - - vdst.write(); - } - - Inst_VOP3__V_DIV_SCALE_F32::Inst_VOP3__V_DIV_SCALE_F32( - InFmt_VOP3_SDST_ENC *iFmt) - : Inst_VOP3_SDST_ENC(iFmt, "v_div_scale_f32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(F32); - } // Inst_VOP3__V_DIV_SCALE_F32 - - Inst_VOP3__V_DIV_SCALE_F32::~Inst_VOP3__V_DIV_SCALE_F32() - { - } // ~Inst_VOP3__V_DIV_SCALE_F32 - - // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f = - // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a - // numerator and denominator, this opcode will appropriately scale inputs - // for division to avoid subnormal terms during Newton-Raphson correction - // algorithm. This opcode producses a VCC flag for post-scale of quotient. - void - Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane]; - vcc.setBit(lane, 0); - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_SCALE_F64 class methods --- - - Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64( - InFmt_VOP3_SDST_ENC *iFmt) - : Inst_VOP3_SDST_ENC(iFmt, "v_div_scale_f64") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(F64); - } // Inst_VOP3__V_DIV_SCALE_F64 - - Inst_VOP3__V_DIV_SCALE_F64::~Inst_VOP3__V_DIV_SCALE_F64() - { - } // ~Inst_VOP3__V_DIV_SCALE_F64 - - // {vcc,D.d} = Divide preop and 
flags -- s0.d = Quotient, s1.d = - // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a - // numerator and denominator, this opcode will appropriately scale inputs - // for division to avoid subnormal terms during Newton-Raphson correction - // algorithm. This opcode producses a VCC flag for post-scale of quotient. - void - Inst_VOP3__V_DIV_SCALE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp1(0); - int exp2(0); - std::frexp(src1[lane], &exp1); - std::frexp(src2[lane], &exp2); - vcc.setBit(lane, 0); - - if (std::fpclassify(src1[lane]) == FP_ZERO - || std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (exp2 - exp1 >= 768) { - vcc.setBit(lane, 1); - if (src0[lane] == src1[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) { - vdst[lane] = std::ldexp(src0[lane], 128); - } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL - && std::fpclassify(src2[lane] / src1[lane]) - == FP_SUBNORMAL) { - vcc.setBit(lane, 1); - if (src0[lane] == src1[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) { - vdst[lane] = std::ldexp(src0[lane], -128); - } else if (std::fpclassify(src2[lane] / src1[lane]) - == FP_SUBNORMAL) { - vcc.setBit(lane, 1); - if (src0[lane] == src2[lane]) { - vdst[lane] = 
std::ldexp(src0[lane], 128); - } - } else if (exp2 <= 53) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } - } - - vcc.write(); - vdst.write(); - } - - Inst_VOP3__V_DIV_FMAS_F32::Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_div_fmas_f32", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - setFlag(F32); - setFlag(FMA); - } // Inst_VOP3__V_DIV_FMAS_F32 - - Inst_VOP3__V_DIV_FMAS_F32::~Inst_VOP3__V_DIV_FMAS_F32() - { - } // ~Inst_VOP3__V_DIV_FMAS_F32 - - // D.f = Special case divide FMA with scale and flags(s0.f = Quotient, - // s1.f = Denominator, s2.f = Numerator) - void - Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - //vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FMAS_F64 class methods --- - - Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_div_fmas_f64", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - setFlag(F64); - setFlag(FMA); - } // Inst_VOP3__V_DIV_FMAS_F64 - - Inst_VOP3__V_DIV_FMAS_F64::~Inst_VOP3__V_DIV_FMAS_F64() - { - } // ~Inst_VOP3__V_DIV_FMAS_F64 - - // D.d = Special case divide FMA with scale and flags(s0.d = Quotient, - // s1.d = Denominator, s2.d = Numerator) - 
void - Inst_VOP3__V_DIV_FMAS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - vcc.read(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(vcc.rawData(), lane)) { - vdst[lane] = std::pow(2, 64) - * std::fma(src0[lane], src1[lane], src2[lane]); - } else { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - } - - vdst.write(); - } - - Inst_VOP3__V_MSAD_U8::Inst_VOP3__V_MSAD_U8(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_msad_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MSAD_U8 - - Inst_VOP3__V_MSAD_U8::~Inst_VOP3__V_MSAD_U8() - { - } // ~Inst_VOP3__V_MSAD_U8 - - // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u). 
    // Not implemented in the timing model; trap if encountered.
    void
    Inst_VOP3__V_MSAD_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_QSAD_PK_U16_U8::Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_qsad_pk_u16_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_QSAD_PK_U16_U8

    Inst_VOP3__V_QSAD_PK_U16_U8::~Inst_VOP3__V_QSAD_PK_U16_U8()
    {
    } // ~Inst_VOP3__V_QSAD_PK_U16_U8

    // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
    // S1.u[31:0], S2.u[63:0])
    // Not implemented; trap if encountered.
    void
    Inst_VOP3__V_QSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_MQSAD_PK_U16_U8::Inst_VOP3__V_MQSAD_PK_U16_U8(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mqsad_pk_u16_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MQSAD_PK_U16_U8

    Inst_VOP3__V_MQSAD_PK_U16_U8::~Inst_VOP3__V_MQSAD_PK_U16_U8()
    {
    } // ~Inst_VOP3__V_MQSAD_PK_U16_U8

    // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
    // S1.u[31:0], S2.u[63:0])
    // Not implemented; trap if encountered.
    void
    Inst_VOP3__V_MQSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_MQSAD_U32_U8::Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mqsad_u32_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MQSAD_U32_U8

    Inst_VOP3__V_MQSAD_U32_U8::~Inst_VOP3__V_MQSAD_U32_U8()
    {
    } // ~Inst_VOP3__V_MQSAD_U32_U8

    // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0],
    // S1.u[31:0], S2.u[127:0])
    // Not implemented; trap if encountered.
    void
    Inst_VOP3__V_MQSAD_U32_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_MAD_U64_U32::Inst_VOP3__V_MAD_U64_U32(
          InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_mad_u64_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_U64_U32

    Inst_VOP3__V_MAD_U64_U32::~Inst_VOP3__V_MAD_U64_U32()
    {
    } // ~Inst_VOP3__V_MAD_U64_U32

    // {vcc_out, D.u64} = S0.u32 * S1.u32 + S2.u64.
    void
    Inst_VOP3__V_MAD_U64_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();
        vdst.read();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // 32x32 -> 64-bit multiply-add per active lane; muladd() stores the
        // 64-bit result into vdst[lane] and returns the carry-out bit,
        // which is recorded in the SDST scalar (vcc_out).
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
                    src2[lane]));
            }
        }

        vcc.write();
        vdst.write();
    }

    Inst_VOP3__V_MAD_I64_I32::Inst_VOP3__V_MAD_I64_I32(
          InFmt_VOP3_SDST_ENC *iFmt)
        : Inst_VOP3_SDST_ENC(iFmt, "v_mad_i64_i32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_I64_I32

    Inst_VOP3__V_MAD_I64_I32::~Inst_VOP3__V_MAD_I64_I32()
    {
    } // ~Inst_VOP3__V_MAD_I64_I32

    // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64.
    void
    Inst_VOP3__V_MAD_I64_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandI64 src2(gpuDynInst, extData.SRC2);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
        VecOperandI64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Signed 32x32 -> 64-bit multiply-add; muladd() writes the result
        // into vdst[lane] and returns the carry-out recorded in vcc_out.
        // NOTE(review): the unsigned twin (V_MAD_U64_U32) calls vdst.read()
        // before this loop and this variant does not — if muladd() treats
        // its first argument as output-only the read is redundant there;
        // otherwise it is missing here. Confirm against muladd()'s
        // definition and make the two consistent.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
                    src2[lane]));
            }
        }

        vcc.write();
        vdst.write();
    }

    Inst_VOP3__V_MAD_F16::Inst_VOP3__V_MAD_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_F16

    Inst_VOP3__V_MAD_F16::~Inst_VOP3__V_MAD_F16()
    {
    } // ~Inst_VOP3__V_MAD_F16

    // D.f16 = S0.f16 * S1.f16 + S2.f16.
    // Supports round mode, exception flags, saturation.
    // Not implemented; trap if encountered.
    void
    Inst_VOP3__V_MAD_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_MAD_U16::Inst_VOP3__V_MAD_U16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_mad_u16", false)
    {
        setFlag(ALU);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_U16

    Inst_VOP3__V_MAD_U16::~Inst_VOP3__V_MAD_U16()
    {
    } // ~Inst_VOP3__V_MAD_U16

    // D.u16 = S0.u16 * S1.u16 + S2.u16.
    // Supports saturation (unsigned 16-bit integer domain).
- void - Inst_VOP3__V_MAD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU16 src2(gpuDynInst, extData.SRC2); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane] + src2[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_MAD_I16::Inst_VOP3__V_MAD_I16(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_mad_i16", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_I16 - - Inst_VOP3__V_MAD_I16::~Inst_VOP3__V_MAD_I16() - { - } // ~Inst_VOP3__V_MAD_I16 - - // D.i16 = S0.i16 * S1.i16 + S2.i16. - // Supports saturation (signed 16-bit integer domain). 
    void
    Inst_VOP3__V_MAD_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandI16 src2(gpuDynInst, extData.SRC2);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Signed 16-bit multiply-add per active lane; arithmetic promotes
        // to int and is truncated back to 16 bits on assignment.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane] + src2[lane];
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_PERM_B32::Inst_VOP3__V_PERM_B32(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_perm_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_PERM_B32

    Inst_VOP3__V_PERM_B32::~Inst_VOP3__V_PERM_B32()
    {
    } // ~Inst_VOP3__V_PERM_B32

    // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]);
    // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]);
    // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]);
    // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]);
    // byte permute(byte in[8], byte sel) {
    //     if(sel>=13) then return 0xff;
    //     elsif(sel==12) then return 0x00;
    //     elsif(sel==11) then return in[7][7] * 0xff;
    //     elsif(sel==10) then return in[5][7] * 0xff;
    //     elsif(sel==9) then return in[3][7] * 0xff;
    //     elsif(sel==8) then return in[1][7] * 0xff;
    //     else return in[sel];
    // }
    void
    Inst_VOP3__V_PERM_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // The 64-bit selector input is {S0, S1}: S0 supplies the
                // high dword, S1 the low dword.
                VecElemU64 selector = (VecElemU64)src0[lane];
                selector = (selector << 32) | (VecElemU64)src1[lane];
                vdst[lane] = 0;

                DPRINTF(GCN3, "Executing v_perm_b32 src_0 0x%08x, src_1 "
                    "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane],
                    src1[lane], src2[lane], vdst[lane]);
                DPRINTF(GCN3, "Selector: 0x%08x \n", selector);

                // Byte i of S2 selects which byte of {S0,S1} (or constant)
                // lands in byte i of the destination.
                for (int i = 0; i < 4 ; ++i) {
                    VecElemU32 permuted_val = permute(selector, 0xFF
                        & ((VecElemU32)src2[lane] >> (8 * i)));
                    vdst[lane] |= (permuted_val << (8 * i));
                }

                DPRINTF(GCN3, "v_perm result: 0x%08x\n", vdst[lane]);
            }
        }

        vdst.write();
    }

    Inst_VOP3__V_FMA_F16::Inst_VOP3__V_FMA_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_fma_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(FMA);
    } // Inst_VOP3__V_FMA_F16

    Inst_VOP3__V_FMA_F16::~Inst_VOP3__V_FMA_F16()
    {
    } // ~Inst_VOP3__V_FMA_F16

    // D.f16 = S0.f16 * S1.f16 + S2.f16.
    // Fused half precision multiply add.
    // Not implemented; trap if encountered.
    void
    Inst_VOP3__V_FMA_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_DIV_FIXUP_F16::Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_div_fixup_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_DIV_FIXUP_F16

    Inst_VOP3__V_DIV_FIXUP_F16::~Inst_VOP3__V_DIV_FIXUP_F16()
    {
    } // ~Inst_VOP3__V_DIV_FIXUP_F16

    // sign_out = sign(S1.f16)^sign(S2.f16);
    // if (S2.f16 == NAN)
    //     D.f16 = Quiet(S2.f16);
    // else if (S1.f16 == NAN)
    //     D.f16 = Quiet(S1.f16);
    // else if (S1.f16 == S2.f16 == 0)
    //     # 0/0
    //     D.f16 = pele_nan(0xfe00);
    // else if (abs(S1.f16) == abs(S2.f16) == +-INF)
    //     # inf/inf
    //     D.f16 = pele_nan(0xfe00);
    // else if (S1.f16 ==0 || abs(S2.f16) == +-INF)
    //     # x/0, or inf/y
    //     D.f16 = sign_out ? -INF : INF;
    // else if (abs(S1.f16) == +-INF || S2.f16 == 0)
    //     # x/inf, 0/y
    //     D.f16 = sign_out ? -0 : 0;
    // else if ((exp(S2.f16) - exp(S1.f16)) < -150)
    //     D.f16 = sign_out ? -underflow : underflow;
    // else if (exp(S1.f16) == 255)
    //     D.f16 = sign_out ? -overflow : overflow;
    // else
    //     D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16).
    // Half precision division fixup.
    // S0 = Quotient, S1 = Denominator, S3 = Numerator.
    // Given a numerator, denominator, and quotient from a divide, this opcode
    // will detect and apply special case numerics, touching up the quotient if
    // necessary. This opcode also generates invalid, denorm and divide by
    // zero exceptions caused by the division.
    // Not implemented; trap if encountered.
    void
    Inst_VOP3__V_DIV_FIXUP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(
          InFmt_VOP3 *iFmt)
        : Inst_VOP3(iFmt, "v_cvt_pkaccum_u8_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKACCUM_U8_F32

    Inst_VOP3__V_CVT_PKACCUM_U8_F32::~Inst_VOP3__V_CVT_PKACCUM_U8_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32

    // byte = S1.u[1:0]; bit = byte * 8;
    // D.u[bit + 7:bit] = flt32_to_uint8(S0.f);
    // Pack converted value of S0.f into byte S1 of the destination.
    // SQ translates to V_CVT_PK_U8_F32.
    // Note: this opcode uses src_c to pass destination in as a source.
void
Inst_VOP3__V_CVT_PKACCUM_U8_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented in this model; trap if encountered.
    panicUnimplemented();
}

Inst_VOP3__V_INTERP_P1_F32::Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_interp_p1_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_INTERP_P1_F32

Inst_VOP3__V_INTERP_P1_F32::~Inst_VOP3__V_INTERP_P1_F32()
{
} // ~Inst_VOP3__V_INTERP_P1_F32

// D.f = P10 * S.f + P0;
void
Inst_VOP3__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // Graphics interpolation; not implemented in this model.
    panicUnimplemented();
}

Inst_VOP3__V_INTERP_P2_F32::Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_interp_p2_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_INTERP_P2_F32

Inst_VOP3__V_INTERP_P2_F32::~Inst_VOP3__V_INTERP_P2_F32()
{
} // ~Inst_VOP3__V_INTERP_P2_F32

// D.f = P20 * S.f + D.f;
void
Inst_VOP3__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // Graphics interpolation; not implemented in this model.
    panicUnimplemented();
}

Inst_VOP3__V_INTERP_MOV_F32::Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_interp_mov_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_INTERP_MOV_F32

Inst_VOP3__V_INTERP_MOV_F32::~Inst_VOP3__V_INTERP_MOV_F32()
{
} // ~Inst_VOP3__V_INTERP_MOV_F32

// D.f = {P10,P20,P0}[S.u]; parameter load.
void
Inst_VOP3__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // Graphics interpolation; not implemented in this model.
    panicUnimplemented();
}

Inst_VOP3__V_INTERP_P1LL_F16::Inst_VOP3__V_INTERP_P1LL_F16(
      InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_interp_p1ll_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP3__V_INTERP_P1LL_F16

Inst_VOP3__V_INTERP_P1LL_F16::~Inst_VOP3__V_INTERP_P1LL_F16()
{
} // ~Inst_VOP3__V_INTERP_P1LL_F16

// D.f32 = P10.f16 * S0.f32 + P0.f16.
void
Inst_VOP3__V_INTERP_P1LL_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // Graphics interpolation; not implemented in this model.
    panicUnimplemented();
}

Inst_VOP3__V_INTERP_P1LV_F16::Inst_VOP3__V_INTERP_P1LV_F16(
      InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_interp_p1lv_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP3__V_INTERP_P1LV_F16

Inst_VOP3__V_INTERP_P1LV_F16::~Inst_VOP3__V_INTERP_P1LV_F16()
{
} // ~Inst_VOP3__V_INTERP_P1LV_F16

void
Inst_VOP3__V_INTERP_P1LV_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // Graphics interpolation; not implemented in this model.
    panicUnimplemented();
}

Inst_VOP3__V_INTERP_P2_F16::Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_interp_p2_f16", false)
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP3__V_INTERP_P2_F16

Inst_VOP3__V_INTERP_P2_F16::~Inst_VOP3__V_INTERP_P2_F16()
{
} // ~Inst_VOP3__V_INTERP_P2_F16

// D.f16 = P20.f16 * S0.f32 + S2.f32.
void
Inst_VOP3__V_INTERP_P2_F16::execute(GPUDynInstPtr gpuDynInst)
{
    // Graphics interpolation; not implemented in this model.
    panicUnimplemented();
}

Inst_VOP3__V_ADD_F64::Inst_VOP3__V_ADD_F64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_add_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_ADD_F64

Inst_VOP3__V_ADD_F64::~Inst_VOP3__V_ADD_F64()
{
} // ~Inst_VOP3__V_ADD_F64

// D.d = S0.d + S1.d.
- void - Inst_VOP3__V_ADD_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane]) ) { - vdst[lane] = NAN; - } else if (std::isinf(src0[lane]) && - std::isinf(src1[lane])) { - if (std::signbit(src0[lane]) != - std::signbit(src1[lane])) { - vdst[lane] = NAN; - } else { - vdst[lane] = src0[lane]; - } - } else if (std::isinf(src0[lane])) { - vdst[lane] = src0[lane]; - } else if (std::isinf(src1[lane])) { - vdst[lane] = src1[lane]; - } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - if (std::signbit(src0[lane]) && - std::signbit(src1[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = src1[lane]; - } - } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) { - if (std::signbit(src0[lane]) && - std::signbit(src1[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = src0[lane]; - } - } else { - vdst[lane] = src0[lane] + src1[lane]; - } - } - } - - vdst.write(); - } - - 
Inst_VOP3__V_MUL_F64::Inst_VOP3__V_MUL_F64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mul_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_MUL_F64

Inst_VOP3__V_MUL_F64::~Inst_VOP3__V_MUL_F64()
{
} // ~Inst_VOP3__V_MUL_F64

// D.d = S0.d * S1.d.
// Double-precision multiply with explicit handling of NaN, zero/denormal
// (flushed to signed zero), and infinity sign combinations.
void
Inst_VOP3__V_MUL_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Apply the VOP3 input modifiers (|x| and -x) per source operand.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            if (std::isnan(src0[lane]) ||
                std::isnan(src1[lane])) {
                vdst[lane] = NAN;
            } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                       std::fpclassify(src0[lane]) == FP_ZERO) &&
                       !std::signbit(src0[lane])) {
                // +0 * inf -> NaN; otherwise sign of S1 picks +/-0.
                if (std::isinf(src1[lane])) {
                    vdst[lane] = NAN;
                } else if (!std::signbit(src1[lane])) {
                    vdst[lane] = +0.0;
                } else {
                    vdst[lane] = -0.0;
                }
            } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                       std::fpclassify(src0[lane]) == FP_ZERO) &&
                       std::signbit(src0[lane])) {
                // -0 * inf -> NaN; otherwise sign of S1 picks +/-0.
                if (std::isinf(src1[lane])) {
                    vdst[lane] = NAN;
                } else if (std::signbit(src1[lane])) {
                    vdst[lane] = +0.0;
                } else {
                    vdst[lane] = -0.0;
                }
            } else if (std::isinf(src0[lane]) &&
                       !std::signbit(src0[lane])) {
                // +inf * 0 -> NaN; otherwise signed infinity.
                if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                    std::fpclassify(src1[lane]) == FP_ZERO) {
                    vdst[lane] = NAN;
                } else if (!std::signbit(src1[lane])) {
                    vdst[lane] = +INFINITY;
                } else {
                    vdst[lane] = -INFINITY;
                }
            } else if (std::isinf(src0[lane]) &&
                       std::signbit(src0[lane])) {
                // -inf * 0 -> NaN; otherwise signed infinity.
                if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                    std::fpclassify(src1[lane]) == FP_ZERO) {
                    vdst[lane] = NAN;
                } else if (std::signbit(src1[lane])) {
                    vdst[lane] = +INFINITY;
                } else {
                    vdst[lane] = -INFINITY;
                }
            } else {
                // Normal case.
                vdst[lane] = src0[lane] * src1[lane];
            }
        }
    }

    vdst.write();
}

Inst_VOP3__V_MIN_F64::Inst_VOP3__V_MIN_F64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_min_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_MIN_F64

Inst_VOP3__V_MIN_F64::~Inst_VOP3__V_MIN_F64()
{
} // ~Inst_VOP3__V_MIN_F64

// D.d = min(S0.d, S1.d).
void
Inst_VOP3__V_MIN_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Apply the VOP3 input modifiers (|x| and -x) per source operand.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // std::fmin returns the non-NaN operand if only one is NaN.
            vdst[lane] = std::fmin(src0[lane], src1[lane]);
        }
    }

    vdst.write();
}

Inst_VOP3__V_MAX_F64::Inst_VOP3__V_MAX_F64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_max_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_MAX_F64

Inst_VOP3__V_MAX_F64::~Inst_VOP3__V_MAX_F64()
{
} // ~Inst_VOP3__V_MAX_F64

// D.d = max(S0.d, S1.d).
void
Inst_VOP3__V_MAX_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Apply the VOP3 input modifiers (|x| and -x) per source operand.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // std::fmax returns the non-NaN operand if only one is NaN.
            vdst[lane] = std::fmax(src0[lane], src1[lane]);
        }
    }

    vdst.write();
}

Inst_VOP3__V_LDEXP_F64::Inst_VOP3__V_LDEXP_F64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_ldexp_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_LDEXP_F64

Inst_VOP3__V_LDEXP_F64::~Inst_VOP3__V_LDEXP_F64()
{
} // ~Inst_VOP3__V_LDEXP_F64

// D.d = S0.d * 2^S1.i[31:0] (ldexp).
// Double-precision ldexp: scale S0 by 2^S1. NaN/inf pass through and
// zero/denormal inputs flush to signed zero.
void
Inst_VOP3__V_LDEXP_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
    // NOTE(review): exponent operand is read as unsigned here although the
    // ISA pseudo-code treats S1 as a signed int — confirm intended.
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Modifiers only apply to the FP operand (S0).
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            if (std::isnan(src0[lane]) || std::isinf(src0[lane])) {
                vdst[lane] = src0[lane];
            } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                       || std::fpclassify(src0[lane]) == FP_ZERO) {
                if (std::signbit(src0[lane])) {
                    vdst[lane] = -0.0;
                } else {
                    vdst[lane] = +0.0;
                }
            } else {
                vdst[lane] = std::ldexp(src0[lane], src1[lane]);
            }
        }
    }

    vdst.write();
}

Inst_VOP3__V_MUL_LO_U32::Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mul_lo_u32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MUL_LO_U32

Inst_VOP3__V_MUL_LO_U32::~Inst_VOP3__V_MUL_LO_U32()
{
} // ~Inst_VOP3__V_MUL_LO_U32

// D.u = S0.u * S1.u.
void
Inst_VOP3__V_MUL_LO_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Widen to 64 bits so the full product exists, then keep
            // the low 32 bits.
            VecElemI64 s0 = (VecElemI64)src0[lane];
            VecElemI64 s1 = (VecElemI64)src1[lane];
            vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL);
        }
    }

    vdst.write();
}

Inst_VOP3__V_MUL_HI_U32::Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mul_hi_u32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MUL_HI_U32

Inst_VOP3__V_MUL_HI_U32::~Inst_VOP3__V_MUL_HI_U32()
{
} // ~Inst_VOP3__V_MUL_HI_U32

// D.u = (S0.u * S1.u) >> 32.
void
Inst_VOP3__V_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Widen to 64 bits, then keep the high 32 bits of the product.
            VecElemI64 s0 = (VecElemI64)src0[lane];
            VecElemI64 s1 = (VecElemI64)src1[lane];
            vdst[lane]
                = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL);
        }
    }

    vdst.write();
}

Inst_VOP3__V_MUL_HI_I32::Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_mul_hi_i32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MUL_HI_I32

Inst_VOP3__V_MUL_HI_I32::~Inst_VOP3__V_MUL_HI_I32()
{
} // ~Inst_VOP3__V_MUL_HI_I32

// D.i = (S0.i * S1.i) >> 32.
// Signed high-half 32x32 multiply.
void
Inst_VOP3__V_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    VecOperandI32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Sign-extend to 64 bits, then keep the high 32 bits.
            VecElemI64 s0 = (VecElemI64)src0[lane];
            VecElemI64 s1 = (VecElemI64)src1[lane];
            vdst[lane]
                = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL);
        }
    }

    vdst.write();
}

Inst_VOP3__V_LDEXP_F32::Inst_VOP3__V_LDEXP_F32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_ldexp_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_LDEXP_F32

Inst_VOP3__V_LDEXP_F32::~Inst_VOP3__V_LDEXP_F32()
{
} // ~Inst_VOP3__V_LDEXP_F32

// D.f = S0.f * 2^S1.i (ldexp).
void
Inst_VOP3__V_LDEXP_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::ldexp(src0[lane], src1[lane]);
        }
    }

    vdst.write();
}

Inst_VOP3__V_READLANE_B32::Inst_VOP3__V_READLANE_B32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_readlane_b32", true)
{
    setFlag(ALU);
    setFlag(IgnoreExec);
} // Inst_VOP3__V_READLANE_B32

Inst_VOP3__V_READLANE_B32::~Inst_VOP3__V_READLANE_B32()
{
} // ~Inst_VOP3__V_READLANE_B32

// Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR#
// or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask.
// Input and output modifiers not supported; this is an untyped operation.
void
Inst_VOP3__V_READLANE_B32::execute(GPUDynInstPtr gpuDynInst)
{
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
    ScalarOperandU32 sdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.read();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    // Lane select is masked to 6 bits (0..63, wavefront width).
    sdst = src0[src1.rawData() & 0x3f];

    sdst.write();
}

Inst_VOP3__V_WRITELANE_B32::Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_writelane_b32", false)
{
    setFlag(ALU);
    setFlag(IgnoreExec);
} // Inst_VOP3__V_WRITELANE_B32

Inst_VOP3__V_WRITELANE_B32::~Inst_VOP3__V_WRITELANE_B32()
{
} // ~Inst_VOP3__V_WRITELANE_B32

// Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data
// (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores
// exec mask. Input and output modifiers not supported; this is an untyped
// operation.
- void - Inst_VOP3__V_WRITELANE_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0); - ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.read(); - src1.read(); - vdst.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - vdst[src1.rawData() & 0x3f] = src0.rawData(); - - vdst.write(); - } - - Inst_VOP3__V_BCNT_U32_B32::Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_bcnt_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BCNT_U32_B32 - - Inst_VOP3__V_BCNT_U32_B32::~Inst_VOP3__V_BCNT_U32_B32() - { - } // ~Inst_VOP3__V_BCNT_U32_B32 - - // D.u = CountOneBits(S0.u) + S1.u. Bit count. - void - Inst_VOP3__V_BCNT_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = popCount(src0[lane]) + src1[lane]; - } - } - - vdst.write(); - } - - Inst_VOP3__V_MBCNT_LO_U32_B32::Inst_VOP3__V_MBCNT_LO_U32_B32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_mbcnt_lo_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MBCNT_LO_U32_B32 - - Inst_VOP3__V_MBCNT_LO_U32_B32::~Inst_VOP3__V_MBCNT_LO_U32_B32() - { - } // ~Inst_VOP3__V_MBCNT_LO_U32_B32 - - // Masked 
bit count, ThreadPosition is the position of this thread in the - // wavefront (in 0..63). - void - Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - uint64_t threadMask = 0; - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); - vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods --- - - Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32( - InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_mbcnt_hi_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MBCNT_HI_U32_B32 - - Inst_VOP3__V_MBCNT_HI_U32_B32::~Inst_VOP3__V_MBCNT_HI_U32_B32() - { - } // ~Inst_VOP3__V_MBCNT_HI_U32_B32 - - // ThreadMask = (1 << ThreadPosition) - 1; - // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u. - // Masked bit count, ThreadPosition is the position of this thread in the - // wavefront (in 0..63). 
- void - Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - uint64_t threadMask = 0; - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); - vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHLREV_B64 class methods --- - - Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3 *iFmt) - : Inst_VOP3(iFmt, "v_lshlrev_b64", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B64 - - Inst_VOP3__V_LSHLREV_B64::~Inst_VOP3__V_LSHLREV_B64() - { - } // ~Inst_VOP3__V_LSHLREV_B64 - - // D.u64 = S1.u64 << S0.u[5:0]. 
// 64-bit logical shift left; shift count comes from S0[5:0] (reversed
// operand order relative to non-REV shifts).
void
Inst_VOP3__V_LSHLREV_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src1[lane] << bits(src0[lane], 5, 0);
        }
    }

    vdst.write();
}

Inst_VOP3__V_LSHRREV_B64::Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_lshrrev_b64", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_LSHRREV_B64

Inst_VOP3__V_LSHRREV_B64::~Inst_VOP3__V_LSHRREV_B64()
{
} // ~Inst_VOP3__V_LSHRREV_B64

// D.u64 = S1.u64 >> S0.u[5:0].
// The vacated bits are set to zero.
void
Inst_VOP3__V_LSHRREV_B64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Unsigned source type gives the required logical shift.
            vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0);
        }
    }

    vdst.write();
}

Inst_VOP3__V_ASHRREV_I64::Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_ashrrev_i64", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_ASHRREV_I64

Inst_VOP3__V_ASHRREV_I64::~Inst_VOP3__V_ASHRREV_I64()
{
} // ~Inst_VOP3__V_ASHRREV_I64

// D.u64 = signext(S1.u64) >> S0.u[5:0].
// The vacated bits are set to the sign bit of the input value.
// 64-bit arithmetic shift right; the signed source type of src1 provides
// the sign-extending shift.
void
Inst_VOP3__V_ASHRREV_I64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI64 src1(gpuDynInst, extData.SRC1);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane]
                = src1[lane] >> bits(src0[lane], 5, 0);
        }
    }

    vdst.write();
}

Inst_VOP3__V_TRIG_PREOP_F64::Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_trig_preop_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_TRIG_PREOP_F64

Inst_VOP3__V_TRIG_PREOP_F64::~Inst_VOP3__V_TRIG_PREOP_F64()
{
} // ~Inst_VOP3__V_TRIG_PREOP_F64

void
Inst_VOP3__V_TRIG_PREOP_F64::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented in this model; trap if encountered.
    panicUnimplemented();
}

Inst_VOP3__V_BFM_B32::Inst_VOP3__V_BFM_B32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_bfm_b32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_BFM_B32

Inst_VOP3__V_BFM_B32::~Inst_VOP3__V_BFM_B32()
{
} // ~Inst_VOP3__V_BFM_B32

// D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0];
// Bitfield mask: S0 gives the width, S1 gives the offset.
void
Inst_VOP3__V_BFM_B32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1)
                << bits(src1[lane], 4, 0);
        }
    }

    vdst.write();
}

Inst_VOP3__V_CVT_PKNORM_I16_F32::Inst_VOP3__V_CVT_PKNORM_I16_F32(
      InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_pknorm_i16_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_PKNORM_I16_F32

Inst_VOP3__V_CVT_PKNORM_I16_F32::~Inst_VOP3__V_CVT_PKNORM_I16_F32()
{
} // ~Inst_VOP3__V_CVT_PKNORM_I16_F32

// D = {(snorm)S1.f, (snorm)S0.f}.
void
Inst_VOP3__V_CVT_PKNORM_I16_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented in this model; trap if encountered.
    panicUnimplemented();
}

Inst_VOP3__V_CVT_PKNORM_U16_F32::Inst_VOP3__V_CVT_PKNORM_U16_F32(
      InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_pknorm_u16_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_PKNORM_U16_F32

Inst_VOP3__V_CVT_PKNORM_U16_F32::~Inst_VOP3__V_CVT_PKNORM_U16_F32()
{
} // ~Inst_VOP3__V_CVT_PKNORM_U16_F32

// D = {(unorm)S1.f, (unorm)S0.f}.
void
Inst_VOP3__V_CVT_PKNORM_U16_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented in this model; trap if encountered.
    panicUnimplemented();
}

Inst_VOP3__V_CVT_PKRTZ_F16_F32::Inst_VOP3__V_CVT_PKRTZ_F16_F32(
      InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_pkrtz_f16_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_CVT_PKRTZ_F16_F32

Inst_VOP3__V_CVT_PKRTZ_F16_F32::~Inst_VOP3__V_CVT_PKRTZ_F16_F32()
{
} // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32

void
Inst_VOP3__V_CVT_PKRTZ_F16_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented in this model; trap if encountered.
    panicUnimplemented();
}

Inst_VOP3__V_CVT_PK_U16_U32::Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_pk_u16_u32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_CVT_PK_U16_U32

Inst_VOP3__V_CVT_PK_U16_U32::~Inst_VOP3__V_CVT_PK_U16_U32()
{
} // ~Inst_VOP3__V_CVT_PK_U16_U32

// D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}.
void
Inst_VOP3__V_CVT_PK_U16_U32::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented in this model; trap if encountered.
    panicUnimplemented();
}

Inst_VOP3__V_CVT_PK_I16_I32::Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3 *iFmt)
    : Inst_VOP3(iFmt, "v_cvt_pk_i16_i32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_CVT_PK_I16_I32

Inst_VOP3__V_CVT_PK_I16_I32::~Inst_VOP3__V_CVT_PK_I16_I32()
{
} // ~Inst_VOP3__V_CVT_PK_I16_I32

// D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}.
void
Inst_VOP3__V_CVT_PK_I16_I32::execute(GPUDynInstPtr gpuDynInst)
{
    // Not implemented in this model; trap if encountered.
    panicUnimplemented();
}

Inst_DS__DS_ADD_U32::Inst_DS__DS_ADD_U32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_add_u32")
{
} // Inst_DS__DS_ADD_U32

Inst_DS__DS_ADD_U32::~Inst_DS__DS_ADD_U32()
{
} // ~Inst_DS__DS_ADD_U32

// tmp = MEM[ADDR];
// MEM[ADDR] += DATA;
// RETURN_DATA = tmp.
void
Inst_DS__DS_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
{
    // LDS atomic; not implemented in this model.
    panicUnimplemented();
}

Inst_DS__DS_SUB_U32::Inst_DS__DS_SUB_U32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_sub_u32")
{
} // Inst_DS__DS_SUB_U32

Inst_DS__DS_SUB_U32::~Inst_DS__DS_SUB_U32()
{
} // ~Inst_DS__DS_SUB_U32

// tmp = MEM[ADDR];
// MEM[ADDR] -= DATA;
// RETURN_DATA = tmp.
void
Inst_DS__DS_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
{
    // LDS atomic; not implemented in this model.
    panicUnimplemented();
}

Inst_DS__DS_RSUB_U32::Inst_DS__DS_RSUB_U32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_rsub_u32")
{
} // Inst_DS__DS_RSUB_U32

Inst_DS__DS_RSUB_U32::~Inst_DS__DS_RSUB_U32()
{
} // ~Inst_DS__DS_RSUB_U32

// tmp = MEM[ADDR];
// MEM[ADDR] = DATA - MEM[ADDR];
// RETURN_DATA = tmp.
// Subtraction with reversed operands.
void
Inst_DS__DS_RSUB_U32::execute(GPUDynInstPtr gpuDynInst)
{
    // LDS atomic; not implemented in this model.
    panicUnimplemented();
}

Inst_DS__DS_INC_U32::Inst_DS__DS_INC_U32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_inc_u32")
{
} // Inst_DS__DS_INC_U32

Inst_DS__DS_INC_U32::~Inst_DS__DS_INC_U32()
{
} // ~Inst_DS__DS_INC_U32

// tmp = MEM[ADDR];
// MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
// RETURN_DATA = tmp.
void
Inst_DS__DS_INC_U32::execute(GPUDynInstPtr gpuDynInst)
{
    // LDS atomic; not implemented in this model.
    panicUnimplemented();
}

Inst_DS__DS_DEC_U32::Inst_DS__DS_DEC_U32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_dec_u32")
{
} // Inst_DS__DS_DEC_U32

Inst_DS__DS_DEC_U32::~Inst_DS__DS_DEC_U32()
{
} // ~Inst_DS__DS_DEC_U32

// tmp = MEM[ADDR];
// MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
// (unsigned compare); RETURN_DATA = tmp.
void
Inst_DS__DS_DEC_U32::execute(GPUDynInstPtr gpuDynInst)
{
    // LDS atomic; not implemented in this model.
    panicUnimplemented();
}

Inst_DS__DS_MIN_I32::Inst_DS__DS_MIN_I32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_min_i32")
{
} // Inst_DS__DS_MIN_I32

Inst_DS__DS_MIN_I32::~Inst_DS__DS_MIN_I32()
{
} // ~Inst_DS__DS_MIN_I32

// tmp = MEM[ADDR];
// MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
// RETURN_DATA = tmp.
void
Inst_DS__DS_MIN_I32::execute(GPUDynInstPtr gpuDynInst)
{
    // LDS atomic; not implemented in this model.
    panicUnimplemented();
}

Inst_DS__DS_MAX_I32::Inst_DS__DS_MAX_I32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_max_i32")
{
} // Inst_DS__DS_MAX_I32

Inst_DS__DS_MAX_I32::~Inst_DS__DS_MAX_I32()
{
} // ~Inst_DS__DS_MAX_I32

// tmp = MEM[ADDR];
// MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
// RETURN_DATA = tmp.
void
Inst_DS__DS_MAX_I32::execute(GPUDynInstPtr gpuDynInst)
{
    // LDS atomic; not implemented in this model.
    panicUnimplemented();
}

Inst_DS__DS_MIN_U32::Inst_DS__DS_MIN_U32(InFmt_DS *iFmt)
    : Inst_DS(iFmt, "ds_min_u32")
{
} // Inst_DS__DS_MIN_U32

Inst_DS__DS_MIN_U32::~Inst_DS__DS_MIN_U32()
{
} // ~Inst_DS__DS_MIN_U32

// tmp = MEM[ADDR];
// MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
// RETURN_DATA = tmp.
- void - Inst_DS__DS_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_U32::Inst_DS__DS_MAX_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_u32") - { - } // Inst_DS__DS_MAX_U32 - - Inst_DS__DS_MAX_U32::~Inst_DS__DS_MAX_U32() - { - } // ~Inst_DS__DS_MAX_U32 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_DS__DS_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_AND_B32::Inst_DS__DS_AND_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_b32") - { - } // Inst_DS__DS_AND_B32 - - Inst_DS__DS_AND_B32::~Inst_DS__DS_AND_B32() - { - } // ~Inst_DS__DS_AND_B32 - - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_b32") - { - } // Inst_DS__DS_OR_B32 - - Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32() - { - } // ~Inst_DS__DS_OR_B32 - - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_b32") - { - } // Inst_DS__DS_XOR_B32 - - Inst_DS__DS_XOR_B32::~Inst_DS__DS_XOR_B32() - { - } // ~Inst_DS__DS_XOR_B32 - - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MSKOR_B32::Inst_DS__DS_MSKOR_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_b32") - { - } // Inst_DS__DS_MSKOR_B32 - - Inst_DS__DS_MSKOR_B32::~Inst_DS__DS_MSKOR_B32() - { - } // ~Inst_DS__DS_MSKOR_B32 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_MSKOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_WRITE_B32::Inst_DS__DS_WRITE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b32") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B32 - - Inst_DS__DS_WRITE_B32::~Inst_DS__DS_WRITE_B32() - { - } // ~Inst_DS__DS_WRITE_B32 - - // MEM[ADDR] = DATA. - // Write dword. - void - Inst_DS__DS_WRITE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_WRITE_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_DS__DS_WRITE2_B32::Inst_DS__DS_WRITE2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2_b32") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2_B32 - - Inst_DS__DS_WRITE2_B32::~Inst_DS__DS_WRITE2_B32() - { - } // ~Inst_DS__DS_WRITE2_B32 - - // MEM[ADDR_BASE + OFFSET0 * 4] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 4] = DATA2. - // Write 2 dwords. 
- void - Inst_DS__DS_WRITE2_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 2] - = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_WRITE2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 4; - Addr offset1 = instData.OFFSET1 * 4; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_DS__DS_WRITE2ST64_B32::Inst_DS__DS_WRITE2ST64_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2st64_b32") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2ST64_B32 - - Inst_DS__DS_WRITE2ST64_B32::~Inst_DS__DS_WRITE2ST64_B32() - { - } // ~Inst_DS__DS_WRITE2ST64_B32 - - // MEM[ADDR_BASE + OFFSET0 * 4 * 64] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 4 * 64] = DATA2; - // Write 2 dwords. 
- void - Inst_DS__DS_WRITE2ST64_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 2] - = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 4 * 64; - Addr offset1 = instData.OFFSET1 * 4 * 64; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_CMPST_B32 class methods --- - - Inst_DS__DS_CMPST_B32::Inst_DS__DS_CMPST_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_b32") - { - } // Inst_DS__DS_CMPST_B32 - - Inst_DS__DS_CMPST_B32::~Inst_DS__DS_CMPST_B32() - { - } // ~Inst_DS__DS_CMPST_B32 - - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. 
- void - Inst_DS__DS_CMPST_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_CMPST_F32::Inst_DS__DS_CMPST_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_f32") - { - setFlag(F32); - } // Inst_DS__DS_CMPST_F32 - - Inst_DS__DS_CMPST_F32::~Inst_DS__DS_CMPST_F32() - { - } // ~Inst_DS__DS_CMPST_F32 - - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - void - Inst_DS__DS_CMPST_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_F32::Inst_DS__DS_MIN_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_f32") - { - setFlag(F32); - } // Inst_DS__DS_MIN_F32 - - Inst_DS__DS_MIN_F32::~Inst_DS__DS_MIN_F32() - { - } // ~Inst_DS__DS_MIN_F32 - - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - void - Inst_DS__DS_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_F32::Inst_DS__DS_MAX_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_f32") - { - setFlag(F32); - } // Inst_DS__DS_MAX_F32 - - Inst_DS__DS_MAX_F32::~Inst_DS__DS_MAX_F32() - { - } // ~Inst_DS__DS_MAX_F32 - - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. - void - Inst_DS__DS_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_NOP::Inst_DS__DS_NOP(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_nop") - { - setFlag(Nop); - } // Inst_DS__DS_NOP - - Inst_DS__DS_NOP::~Inst_DS__DS_NOP() - { - } // ~Inst_DS__DS_NOP - - // Do nothing. - void - Inst_DS__DS_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } - - Inst_DS__DS_ADD_F32::Inst_DS__DS_ADD_F32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_f32") - { - setFlag(F32); - } // Inst_DS__DS_ADD_F32 - - Inst_DS__DS_ADD_F32::~Inst_DS__DS_ADD_F32() - { - } // ~Inst_DS__DS_ADD_F32 - - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_WRITE_B8::Inst_DS__DS_WRITE_B8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b8") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B8 - - Inst_DS__DS_WRITE_B8::~Inst_DS__DS_WRITE_B8() - { - } // ~Inst_DS__DS_WRITE_B8 - - // MEM[ADDR] = DATA[7:0]. - void - Inst_DS__DS_WRITE_B8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU8 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B8::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B16 class methods --- - - Inst_DS__DS_WRITE_B16::Inst_DS__DS_WRITE_B16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b16") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B16 - - Inst_DS__DS_WRITE_B16::~Inst_DS__DS_WRITE_B16() - { - } // ~Inst_DS__DS_WRITE_B16 - - // MEM[ADDR] = DATA[15:0] - void - Inst_DS__DS_WRITE_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - 
gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU16 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B16::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_ADD_RTN_U32 class methods --- - - Inst_DS__DS_ADD_RTN_U32::Inst_DS__DS_ADD_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_rtn_u32") - { - } // Inst_DS__DS_ADD_RTN_U32 - - Inst_DS__DS_ADD_RTN_U32::~Inst_DS__DS_ADD_RTN_U32() - { - } // ~Inst_DS__DS_ADD_RTN_U32 - - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. - void - Inst_DS__DS_ADD_RTN_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_SUB_RTN_U32::Inst_DS__DS_SUB_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_rtn_u32") - { - } // Inst_DS__DS_SUB_RTN_U32 - - Inst_DS__DS_SUB_RTN_U32::~Inst_DS__DS_SUB_RTN_U32() - { - } // ~Inst_DS__DS_SUB_RTN_U32 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. 
    // NOTE: all DS "*_rtn" atomics below are unimplemented; executing any
    // of them calls panicUnimplemented() and terminates simulation.
    void
    Inst_DS__DS_SUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_RSUB_RTN_U32::Inst_DS__DS_RSUB_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_rtn_u32")
    {
    } // Inst_DS__DS_RSUB_RTN_U32

    Inst_DS__DS_RSUB_RTN_U32::~Inst_DS__DS_RSUB_RTN_U32()
    {
    } // ~Inst_DS__DS_RSUB_RTN_U32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA - MEM[ADDR];
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_RSUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_INC_RTN_U32::Inst_DS__DS_INC_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_rtn_u32")
    {
    } // Inst_DS__DS_INC_RTN_U32

    Inst_DS__DS_INC_RTN_U32::~Inst_DS__DS_INC_RTN_U32()
    {
    } // ~Inst_DS__DS_INC_RTN_U32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_INC_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_DEC_RTN_U32::Inst_DS__DS_DEC_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_rtn_u32")
    {
    } // Inst_DS__DS_DEC_RTN_U32

    Inst_DS__DS_DEC_RTN_U32::~Inst_DS__DS_DEC_RTN_U32()
    {
    } // ~Inst_DS__DS_DEC_RTN_U32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare); RETURN_DATA = tmp.
    void
    Inst_DS__DS_DEC_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MIN_RTN_I32::Inst_DS__DS_MIN_RTN_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_i32")
    {
    } // Inst_DS__DS_MIN_RTN_I32

    Inst_DS__DS_MIN_RTN_I32::~Inst_DS__DS_MIN_RTN_I32()
    {
    } // ~Inst_DS__DS_MIN_RTN_I32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_MIN_RTN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MAX_RTN_I32::Inst_DS__DS_MAX_RTN_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_i32")
    {
    } // Inst_DS__DS_MAX_RTN_I32

    Inst_DS__DS_MAX_RTN_I32::~Inst_DS__DS_MAX_RTN_I32()
    {
    } // ~Inst_DS__DS_MAX_RTN_I32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_MAX_RTN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MIN_RTN_U32::Inst_DS__DS_MIN_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_u32")
    {
    } // Inst_DS__DS_MIN_RTN_U32

    Inst_DS__DS_MIN_RTN_U32::~Inst_DS__DS_MIN_RTN_U32()
    {
    } // ~Inst_DS__DS_MIN_RTN_U32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_MIN_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MAX_RTN_U32::Inst_DS__DS_MAX_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_u32")
    {
    } // Inst_DS__DS_MAX_RTN_U32

    Inst_DS__DS_MAX_RTN_U32::~Inst_DS__DS_MAX_RTN_U32()
    {
    } // ~Inst_DS__DS_MAX_RTN_U32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_MAX_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_AND_RTN_B32::Inst_DS__DS_AND_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_rtn_b32")
    {
    } // Inst_DS__DS_AND_RTN_B32

    Inst_DS__DS_AND_RTN_B32::~Inst_DS__DS_AND_RTN_B32()
    {
    } // ~Inst_DS__DS_AND_RTN_B32

    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_AND_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_OR_RTN_B32::Inst_DS__DS_OR_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_rtn_b32")
    {
    } // Inst_DS__DS_OR_RTN_B32

    Inst_DS__DS_OR_RTN_B32::~Inst_DS__DS_OR_RTN_B32()
    {
    } // ~Inst_DS__DS_OR_RTN_B32

    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_OR_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_XOR_RTN_B32::Inst_DS__DS_XOR_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_rtn_b32")
    {
    } // Inst_DS__DS_XOR_RTN_B32

    Inst_DS__DS_XOR_RTN_B32::~Inst_DS__DS_XOR_RTN_B32()
    {
    } // ~Inst_DS__DS_XOR_RTN_B32

    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_XOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MSKOR_RTN_B32::Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_rtn_b32")
    {
    } // Inst_DS__DS_MSKOR_RTN_B32

    Inst_DS__DS_MSKOR_RTN_B32::~Inst_DS__DS_MSKOR_RTN_B32()
    {
    } // ~Inst_DS__DS_MSKOR_RTN_B32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_MSKOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_WRXCHG_RTN_B32::Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg_rtn_b32")
    {
    } // Inst_DS__DS_WRXCHG_RTN_B32

    Inst_DS__DS_WRXCHG_RTN_B32::~Inst_DS__DS_WRXCHG_RTN_B32()
    {
    } // ~Inst_DS__DS_WRXCHG_RTN_B32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
    // Write-exchange operation.
    void
    Inst_DS__DS_WRXCHG_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_WRXCHG2_RTN_B32::Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2_rtn_b32")
    {
    } // Inst_DS__DS_WRXCHG2_RTN_B32

    Inst_DS__DS_WRXCHG2_RTN_B32::~Inst_DS__DS_WRXCHG2_RTN_B32()
    {
    } // ~Inst_DS__DS_WRXCHG2_RTN_B32

    // Write-exchange 2 separate dwords.
    void
    Inst_DS__DS_WRXCHG2_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_WRXCHG2ST64_RTN_B32::Inst_DS__DS_WRXCHG2ST64_RTN_B32(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b32")
    {
    } // Inst_DS__DS_WRXCHG2ST64_RTN_B32

    Inst_DS__DS_WRXCHG2ST64_RTN_B32::~Inst_DS__DS_WRXCHG2ST64_RTN_B32()
    {
    } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B32

    // Write-exchange 2 separate dwords with a stride of 64 dwords.
    void
    Inst_DS__DS_WRXCHG2ST64_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_CMPST_RTN_B32::Inst_DS__DS_CMPST_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_b32")
    {
    } // Inst_DS__DS_CMPST_RTN_B32

    Inst_DS__DS_CMPST_RTN_B32::~Inst_DS__DS_CMPST_RTN_B32()
    {
    } // ~Inst_DS__DS_CMPST_RTN_B32

    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Compare and store.
    void
    Inst_DS__DS_CMPST_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_CMPST_RTN_F32::Inst_DS__DS_CMPST_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_CMPST_RTN_F32

    Inst_DS__DS_CMPST_RTN_F32::~Inst_DS__DS_CMPST_RTN_F32()
    {
    } // ~Inst_DS__DS_CMPST_RTN_F32

    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    void
    Inst_DS__DS_CMPST_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MIN_RTN_F32::Inst_DS__DS_MIN_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MIN_RTN_F32

    Inst_DS__DS_MIN_RTN_F32::~Inst_DS__DS_MIN_RTN_F32()
    {
    } // ~Inst_DS__DS_MIN_RTN_F32

    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    void
    Inst_DS__DS_MIN_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MAX_RTN_F32::Inst_DS__DS_MAX_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MAX_RTN_F32

    Inst_DS__DS_MAX_RTN_F32::~Inst_DS__DS_MAX_RTN_F32()
    {
    } // ~Inst_DS__DS_MAX_RTN_F32

    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    void
    Inst_DS__DS_MAX_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_WRAP_RTN_B32::Inst_DS__DS_WRAP_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrap_rtn_b32")
    {
    } // Inst_DS__DS_WRAP_RTN_B32

    Inst_DS__DS_WRAP_RTN_B32::~Inst_DS__DS_WRAP_RTN_B32()
    {
    } // ~Inst_DS__DS_WRAP_RTN_B32

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? tmp - DATA : tmp + DATA2;
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_WRAP_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_ADD_RTN_F32::Inst_DS__DS_ADD_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_ADD_RTN_F32

    Inst_DS__DS_ADD_RTN_F32::~Inst_DS__DS_ADD_RTN_F32()
    {
    } // ~Inst_DS__DS_ADD_RTN_F32

    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA;
    // RETURN_DATA = tmp.
- void - Inst_DS__DS_ADD_RTN_F32::execute(GPUDynInstPtr gpuDynInst) - { - } - - Inst_DS__DS_READ_B32::Inst_DS__DS_READ_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B32 - - Inst_DS__DS_READ_B32::~Inst_DS__DS_READ_B32() - { - } // ~Inst_DS__DS_READ_B32 - - // RETURN_DATA = MEM[ADDR]. - // Dword read. - void - Inst_DS__DS_READ_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_READ_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - - Inst_DS__DS_READ2_B32::Inst_DS__DS_READ2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2_B32 - - Inst_DS__DS_READ2_B32::~Inst_DS__DS_READ2_B32() - { - } // ~Inst_DS__DS_READ2_B32 - - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4]. - // Read 2 dwords. 
- void - Inst_DS__DS_READ2_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_READ2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 4; - Addr offset1 = instData.OFFSET1 * 4; - - initDualMemRead(gpuDynInst, offset0, offset1); - } // initiateAcc - - void - Inst_DS__DS_READ2_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - - Inst_DS__DS_READ2ST64_B32::Inst_DS__DS_READ2ST64_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2st64_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2ST64_B32 - - Inst_DS__DS_READ2ST64_B32::~Inst_DS__DS_READ2ST64_B32() - { - } // ~Inst_DS__DS_READ2ST64_B32 - - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4 * 64]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4 * 64]. - // Read 2 dwords. 
- void - Inst_DS__DS_READ2ST64_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = (instData.OFFSET0 * 4 * 64); - Addr offset1 = (instData.OFFSET1 * 4 * 64); - - initDualMemRead(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } - // --- Inst_DS__DS_READ_I8 class methods --- - - Inst_DS__DS_READ_I8::Inst_DS__DS_READ_I8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_i8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_I8 - - Inst_DS__DS_READ_I8::~Inst_DS__DS_READ_I8() - { - } // ~Inst_DS__DS_READ_I8 - - // RETURN_DATA = signext(MEM[ADDR][7:0]). - // Signed byte read. - void - Inst_DS__DS_READ_I8::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_READ_U8::Inst_DS__DS_READ_U8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_u8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_U8 - - Inst_DS__DS_READ_U8::~Inst_DS__DS_READ_U8() - { - } // ~Inst_DS__DS_READ_U8 - - // RETURN_DATA = {24'h0,MEM[ADDR][7:0]}. - // Unsigned byte read. 
- void - Inst_DS__DS_READ_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_U8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)(reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ_I16 class methods --- - - Inst_DS__DS_READ_I16::Inst_DS__DS_READ_I16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_i16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_I16 - - Inst_DS__DS_READ_I16::~Inst_DS__DS_READ_I16() - { - } // ~Inst_DS__DS_READ_I16 - - // RETURN_DATA = signext(MEM[ADDR][15:0]). - // Signed short read. - void - Inst_DS__DS_READ_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_READ_U16::Inst_DS__DS_READ_U16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_u16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_U16 - - Inst_DS__DS_READ_U16::~Inst_DS__DS_READ_U16() - { - } // ~Inst_DS__DS_READ_U16 - - // RETURN_DATA = {16'h0,MEM[ADDR][15:0]}. - // Unsigned short read. 
- void - Inst_DS__DS_READ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - void - Inst_DS__DS_READ_U16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)(reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_SWIZZLE_B32 class methods --- - - Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_swizzle_b32") - { - setFlag(Load); - } // Inst_DS__DS_SWIZZLE_B32 - - Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() - { - } // ~Inst_DS__DS_SWIZZLE_B32 - - // RETURN_DATA = swizzle(vgpr_data, offset1:offset0). 
    // Dword swizzle, no data is written to LDS memory;
    void
    Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        // No LDS request is ever issued, so undo the read-request
        // bookkeeping done at issue time.
        wf->rdLmReqsInPipe--;
        wf->validateRequestCounters();

        if (gpuDynInst->exec_mask.none()) {
            return;
        }

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()
                                ->cyclesToTicks(Cycles(24)));

        ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
        VecOperandU32 vdst(gpuDynInst, extData.VDST);
        /**
         * The "DS pattern" is comprised of both offset fields. That is, the
         * swizzle pattern between lanes. Bit 15 of the DS pattern dictates
         * which swizzle mode to use. There are two different swizzle
         * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use
         * QDMode else use Bit-masks mode. The remaining bits dictate how to
         * swizzle the lanes.
         *
         * QDMode:    Chunks the lanes into 4s and swizzles among them.
         *            Bits 7:6 dictate where lane 3 (of the current chunk)
         *            gets its date, 5:4 lane 2, etc.
         *
         * Bit-mask:  This mode breaks bits 14:0 into 3 equal-sized chunks.
         *            14:10 is the xor_mask, 9:5 is the or_mask, and 4:0
         *            is the and_mask. Each lane is swizzled by performing
         *            the appropriate operation using these masks.
         */
        VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0);

        data.read();

        if (bits(ds_pattern, 15)) {
            // QDMode
            for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) {
                /**
                 * This operation allows data sharing between groups
                 * of four consecutive threads. Note the increment by
                 * 4 in the for loop.
                 */
                if (gpuDynInst->exec_mask[lane]) {
                    int index0 = lane + bits(ds_pattern, 1, 0);
                    panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) "
                             "is out of bounds.\n", gpuDynInst->disassemble(),
                             index0);
                    // Inactive source lanes contribute 0, not stale data.
                    vdst[lane]
                        = gpuDynInst->exec_mask[index0] ? data[index0]: 0;
                }
                if (gpuDynInst->exec_mask[lane + 1]) {
                    int index1 = lane + bits(ds_pattern, 3, 2);
                    panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) "
                             "is out of bounds.\n", gpuDynInst->disassemble(),
                             index1);
                    vdst[lane + 1]
                        = gpuDynInst->exec_mask[index1] ? data[index1]: 0;
                }
                if (gpuDynInst->exec_mask[lane + 2]) {
                    int index2 = lane + bits(ds_pattern, 5, 4);
                    panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) "
                             "is out of bounds.\n", gpuDynInst->disassemble(),
                             index2);
                    vdst[lane + 2]
                        = gpuDynInst->exec_mask[index2] ? data[index2]: 0;
                }
                if (gpuDynInst->exec_mask[lane + 3]) {
                    int index3 = lane + bits(ds_pattern, 7, 6);
                    panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) "
                             "is out of bounds.\n", gpuDynInst->disassemble(),
                             index3);
                    vdst[lane + 3]
                        = gpuDynInst->exec_mask[index3] ? data[index3]: 0;
                }
            }
        } else {
            // Bit Mode
            int and_mask = bits(ds_pattern, 4, 0);
            int or_mask = bits(ds_pattern, 9, 5);
            int xor_mask = bits(ds_pattern, 14, 10);
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    int index = (((lane & and_mask) | or_mask) ^ xor_mask);
                    // Adjust for the next 32 lanes.
                    if (lane > 31) {
                        index += 32;
                    }
                    panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is "
                             "out of bounds.\n", gpuDynInst->disassemble(),
                             index);
                    vdst[lane]
                        = gpuDynInst->exec_mask[index] ? data[index] : 0;
                }
            }
        }

        vdst.write();

        /**
         * This is needed because we treat this instruction as a load
         * but it's not an actual memory request.
         * Without this, the destination register never gets marked as
         * free, leading to a possible deadlock
         */
        wf->computeUnit->vrf[wf->simdId]->
            scheduleWriteOperandsFromLoad(wf, gpuDynInst);
    } // execute
    // --- Inst_DS__DS_PERMUTE_B32 class methods ---

    Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_permute_b32")
    {
        setFlag(MemoryRef);
        /**
         * While this operation doesn't actually use DS storage we classify
         * it as a load here because it does a writeback to a VGPR, which
         * fits in better with the LDS pipeline logic.
         */
        setFlag(Load);
    } // Inst_DS__DS_PERMUTE_B32

    Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32()
    {
    } // ~Inst_DS__DS_PERMUTE_B32

    // Forward permute. Scatter: each active lane pushes its DATA0 value
    // to the destination lane selected by its address operand.
    void
    Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()
                                ->cyclesToTicks(Cycles(24)));
        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 data(gpuDynInst, extData.DATA0);
        VecOperandU32 vdst(gpuDynInst, extData.VDST);

        addr.read();
        data.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                /**
                 * One of the offset fields can be used for the index.
                 * It is assumed OFFSET0 would be used, as OFFSET1 is
                 * typically only used for DS ops that operate on two
                 * disparate pieces of data.
                 */
                assert(!instData.OFFSET1);
                /**
                 * The address provided is a byte address, but VGPRs are
                 * 4 bytes, so we must divide by 4 to get the actual VGPR
                 * index. Additionally, the index is calculated modulo the
                 * WF size, 64 in this case, so we simply extract bits 7-2.
                 */
                int index = bits(addr[lane] + instData.OFFSET0, 7, 2);
                panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out "
                         "of bounds.\n", gpuDynInst->disassemble(), index);
                /**
                 * If the shuffled index corresponds to a lane that is
                 * inactive then this instruction writes a 0 to the active
                 * lane in VDST.
                 */
                if (wf->execMask(index)) {
                    vdst[index] = data[lane];
                } else {
                    vdst[index] = 0;
                }
            }
        }

        vdst.write();

        // No LDS request is ever issued, so undo the issue-time
        // bookkeeping for this instruction.
        wf->decLGKMInstsIssued();
        wf->rdLmReqsInPipe--;
        wf->validateRequestCounters();

        /**
         * This is needed because we treat this instruction as a load
         * but it's not an actual memory request.
         * Without this, the destination register never gets marked as
         * free, leading to a possible deadlock
         */
        wf->computeUnit->vrf[wf->simdId]->
            scheduleWriteOperandsFromLoad(wf, gpuDynInst);
    } // execute
    // --- Inst_DS__DS_BPERMUTE_B32 class methods ---

    Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_bpermute_b32")
    {
        setFlag(MemoryRef);
        /**
         * While this operation doesn't actually use DS storage we classify
         * it as a load here because it does a writeback to a VGPR, which
         * fits in better with the LDS pipeline logic.
         */
        setFlag(Load);
    } // Inst_DS__DS_BPERMUTE_B32

    Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32()
    {
    } // ~Inst_DS__DS_BPERMUTE_B32

    // Backward permute.
- void - Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - addr.read(); - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - /** - * One of the offset fields can be used for the index. - * It is assumed OFFSET0 would be used, as OFFSET1 is - * typically only used for DS ops that operate on two - * disparate pieces of data. - */ - assert(!instData.OFFSET1); - /** - * The address provided is a byte address, but VGPRs are - * 4 bytes, so we must divide by 4 to get the actual VGPR - * index. Additionally, the index is calculated modulo the - * WF size, 64 in this case, so we simply extract bits 7-2. - */ - int index = bits(addr[lane] + instData.OFFSET0, 7, 2); - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " - "of bounds.\n", gpuDynInst->disassemble(), index); - /** - * If the shuffled index corresponds to a lane that is - * inactive then this instruction writes a 0 to the active - * lane in VDST. - */ - if (wf->execMask(index)) { - vdst[lane] = data[index]; - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - - wf->decLGKMInstsIssued(); - wf->rdLmReqsInPipe--; - wf->validateRequestCounters(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. 
- * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - } // execute - - // --- Inst_DS__DS_ADD_U64 class methods --- - - Inst_DS__DS_ADD_U64::Inst_DS__DS_ADD_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_u64") - { - } // Inst_DS__DS_ADD_U64 - - Inst_DS__DS_ADD_U64::~Inst_DS__DS_ADD_U64() - { - } // ~Inst_DS__DS_ADD_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_ADD_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_SUB_U64::Inst_DS__DS_SUB_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_u64") - { - } // Inst_DS__DS_SUB_U64 - - Inst_DS__DS_SUB_U64::~Inst_DS__DS_SUB_U64() - { - } // ~Inst_DS__DS_SUB_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_SUB_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_RSUB_U64::Inst_DS__DS_RSUB_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_u64") - { - } // Inst_DS__DS_RSUB_U64 - - Inst_DS__DS_RSUB_U64::~Inst_DS__DS_RSUB_U64() - { - } // ~Inst_DS__DS_RSUB_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. - void - Inst_DS__DS_RSUB_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_INC_U64::Inst_DS__DS_INC_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_u64") - { - } // Inst_DS__DS_INC_U64 - - Inst_DS__DS_INC_U64::~Inst_DS__DS_INC_U64() - { - } // ~Inst_DS__DS_INC_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_INC_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_DEC_U64::Inst_DS__DS_DEC_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_u64") - { - } // Inst_DS__DS_DEC_U64 - - Inst_DS__DS_DEC_U64::~Inst_DS__DS_DEC_U64() - { - } // ~Inst_DS__DS_DEC_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_DEC_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_I64::Inst_DS__DS_MIN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_i64") - { - } // Inst_DS__DS_MIN_I64 - - Inst_DS__DS_MIN_I64::~Inst_DS__DS_MIN_I64() - { - } // ~Inst_DS__DS_MIN_I64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_I64::Inst_DS__DS_MAX_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_i64") - { - } // Inst_DS__DS_MAX_I64 - - Inst_DS__DS_MAX_I64::~Inst_DS__DS_MAX_I64() - { - } // ~Inst_DS__DS_MAX_I64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_U64::Inst_DS__DS_MIN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_u64") - { - } // Inst_DS__DS_MIN_U64 - - Inst_DS__DS_MIN_U64::~Inst_DS__DS_MIN_U64() - { - } // ~Inst_DS__DS_MIN_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_MIN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_U64::Inst_DS__DS_MAX_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_u64") - { - } // Inst_DS__DS_MAX_U64 - - Inst_DS__DS_MAX_U64::~Inst_DS__DS_MAX_U64() - { - } // ~Inst_DS__DS_MAX_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_AND_B64::Inst_DS__DS_AND_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_b64") - { - } // Inst_DS__DS_AND_B64 - - Inst_DS__DS_AND_B64::~Inst_DS__DS_AND_B64() - { - } // ~Inst_DS__DS_AND_B64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_AND_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_OR_B64::Inst_DS__DS_OR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_b64") - { - } // Inst_DS__DS_OR_B64 - - Inst_DS__DS_OR_B64::~Inst_DS__DS_OR_B64() - { - } // ~Inst_DS__DS_OR_B64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_OR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_XOR_B64::Inst_DS__DS_XOR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_b64") - { - } // Inst_DS__DS_XOR_B64 - - Inst_DS__DS_XOR_B64::~Inst_DS__DS_XOR_B64() - { - } // ~Inst_DS__DS_XOR_B64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_XOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MSKOR_B64::Inst_DS__DS_MSKOR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_b64") - { - } // Inst_DS__DS_MSKOR_B64 - - Inst_DS__DS_MSKOR_B64::~Inst_DS__DS_MSKOR_B64() - { - } // ~Inst_DS__DS_MSKOR_B64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. 
- void - Inst_DS__DS_MSKOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_WRITE_B64::Inst_DS__DS_WRITE_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B64 - - Inst_DS__DS_WRITE_B64::~Inst_DS__DS_WRITE_B64() - { - } // ~Inst_DS__DS_WRITE_B64 - - // MEM[ADDR] = DATA. - // Write qword. - void - Inst_DS__DS_WRITE_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_WRITE_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_DS__DS_WRITE2_B64::Inst_DS__DS_WRITE2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2_B64 - - Inst_DS__DS_WRITE2_B64::~Inst_DS__DS_WRITE2_B64() - { - } // ~Inst_DS__DS_WRITE2_B64 - - // MEM[ADDR_BASE + OFFSET0 * 8] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 8] = DATA2. - // Write 2 qwords. 
- void - Inst_DS__DS_WRITE2_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_WRITE2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8; - Addr offset1 = instData.OFFSET1 * 8; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_DS__DS_WRITE2ST64_B64::Inst_DS__DS_WRITE2ST64_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2st64_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2ST64_B64 - - Inst_DS__DS_WRITE2ST64_B64::~Inst_DS__DS_WRITE2ST64_B64() - { - } // ~Inst_DS__DS_WRITE2ST64_B64 - - // MEM[ADDR_BASE + OFFSET0 * 8 * 64] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 8 * 64] = DATA2; - // Write 2 qwords. - void - Inst_DS__DS_WRITE2ST64_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_CMPST_B64::Inst_DS__DS_CMPST_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_b64") - { - } // Inst_DS__DS_CMPST_B64 - - Inst_DS__DS_CMPST_B64::~Inst_DS__DS_CMPST_B64() - { - } // ~Inst_DS__DS_CMPST_B64 - - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? 
src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - void - Inst_DS__DS_CMPST_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_CMPST_F64::Inst_DS__DS_CMPST_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_f64") - { - setFlag(F64); - } // Inst_DS__DS_CMPST_F64 - - Inst_DS__DS_CMPST_F64::~Inst_DS__DS_CMPST_F64() - { - } // ~Inst_DS__DS_CMPST_F64 - - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - void - Inst_DS__DS_CMPST_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_F64::Inst_DS__DS_MIN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_F64 - - Inst_DS__DS_MIN_F64::~Inst_DS__DS_MIN_F64() - { - } // ~Inst_DS__DS_MIN_F64 - - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - void - Inst_DS__DS_MIN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_F64::Inst_DS__DS_MAX_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_F64 - - Inst_DS__DS_MAX_F64::~Inst_DS__DS_MAX_F64() - { - } // ~Inst_DS__DS_MAX_F64 - - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. - void - Inst_DS__DS_MAX_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_ADD_RTN_U64::Inst_DS__DS_ADD_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_rtn_u64") - { - } // Inst_DS__DS_ADD_RTN_U64 - - Inst_DS__DS_ADD_RTN_U64::~Inst_DS__DS_ADD_RTN_U64() - { - } // ~Inst_DS__DS_ADD_RTN_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_ADD_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_SUB_RTN_U64::Inst_DS__DS_SUB_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_rtn_u64") - { - } // Inst_DS__DS_SUB_RTN_U64 - - Inst_DS__DS_SUB_RTN_U64::~Inst_DS__DS_SUB_RTN_U64() - { - } // ~Inst_DS__DS_SUB_RTN_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_SUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_RSUB_RTN_U64::Inst_DS__DS_RSUB_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_rtn_u64") - { - } // Inst_DS__DS_RSUB_RTN_U64 - - Inst_DS__DS_RSUB_RTN_U64::~Inst_DS__DS_RSUB_RTN_U64() - { - } // ~Inst_DS__DS_RSUB_RTN_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. - void - Inst_DS__DS_RSUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_INC_RTN_U64::Inst_DS__DS_INC_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_rtn_u64") - { - } // Inst_DS__DS_INC_RTN_U64 - - Inst_DS__DS_INC_RTN_U64::~Inst_DS__DS_INC_RTN_U64() - { - } // ~Inst_DS__DS_INC_RTN_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_INC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_DEC_RTN_U64::Inst_DS__DS_DEC_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_rtn_u64") - { - } // Inst_DS__DS_DEC_RTN_U64 - - Inst_DS__DS_DEC_RTN_U64::~Inst_DS__DS_DEC_RTN_U64() - { - } // ~Inst_DS__DS_DEC_RTN_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_DEC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_RTN_I64::Inst_DS__DS_MIN_RTN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_i64") - { - } // Inst_DS__DS_MIN_RTN_I64 - - Inst_DS__DS_MIN_RTN_I64::~Inst_DS__DS_MIN_RTN_I64() - { - } // ~Inst_DS__DS_MIN_RTN_I64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_RTN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_RTN_I64::Inst_DS__DS_MAX_RTN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_i64") - { - } // Inst_DS__DS_MAX_RTN_I64 - - Inst_DS__DS_MAX_RTN_I64::~Inst_DS__DS_MAX_RTN_I64() - { - } // ~Inst_DS__DS_MAX_RTN_I64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_RTN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_RTN_U64::Inst_DS__DS_MIN_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_u64") - { - } // Inst_DS__DS_MIN_RTN_U64 - - Inst_DS__DS_MIN_RTN_U64::~Inst_DS__DS_MIN_RTN_U64() - { - } // ~Inst_DS__DS_MIN_RTN_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_RTN_U64::Inst_DS__DS_MAX_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_u64") - { - } // Inst_DS__DS_MAX_RTN_U64 - - Inst_DS__DS_MAX_RTN_U64::~Inst_DS__DS_MAX_RTN_U64() - { - } // ~Inst_DS__DS_MAX_RTN_U64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_MAX_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_AND_RTN_B64::Inst_DS__DS_AND_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_rtn_b64") - { - } // Inst_DS__DS_AND_RTN_B64 - - Inst_DS__DS_AND_RTN_B64::~Inst_DS__DS_AND_RTN_B64() - { - } // ~Inst_DS__DS_AND_RTN_B64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_AND_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_OR_RTN_B64::Inst_DS__DS_OR_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_rtn_b64") - { - } // Inst_DS__DS_OR_RTN_B64 - - Inst_DS__DS_OR_RTN_B64::~Inst_DS__DS_OR_RTN_B64() - { - } // ~Inst_DS__DS_OR_RTN_B64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_OR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_XOR_RTN_B64::Inst_DS__DS_XOR_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_rtn_b64") - { - } // Inst_DS__DS_XOR_RTN_B64 - - Inst_DS__DS_XOR_RTN_B64::~Inst_DS__DS_XOR_RTN_B64() - { - } // ~Inst_DS__DS_XOR_RTN_B64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_XOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MSKOR_RTN_B64::Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_rtn_b64") - { - } // Inst_DS__DS_MSKOR_RTN_B64 - - Inst_DS__DS_MSKOR_RTN_B64::~Inst_DS__DS_MSKOR_RTN_B64() - { - } // ~Inst_DS__DS_MSKOR_RTN_B64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. - // Masked dword OR, D0 contains the mask and D1 contains the new value. 
- void - Inst_DS__DS_MSKOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_WRXCHG_RTN_B64::Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg_rtn_b64") - { - } // Inst_DS__DS_WRXCHG_RTN_B64 - - Inst_DS__DS_WRXCHG_RTN_B64::~Inst_DS__DS_WRXCHG_RTN_B64() - { - } // ~Inst_DS__DS_WRXCHG_RTN_B64 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - // Write-exchange operation. - void - Inst_DS__DS_WRXCHG_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_WRXCHG2_RTN_B64::Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2_rtn_b64") - { - } // Inst_DS__DS_WRXCHG2_RTN_B64 - - Inst_DS__DS_WRXCHG2_RTN_B64::~Inst_DS__DS_WRXCHG2_RTN_B64() - { - } // ~Inst_DS__DS_WRXCHG2_RTN_B64 - - // Write-exchange 2 separate qwords. - void - Inst_DS__DS_WRXCHG2_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_WRXCHG2ST64_RTN_B64::Inst_DS__DS_WRXCHG2ST64_RTN_B64( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b64") - { - } // Inst_DS__DS_WRXCHG2ST64_RTN_B64 - - Inst_DS__DS_WRXCHG2ST64_RTN_B64::~Inst_DS__DS_WRXCHG2ST64_RTN_B64() - { - } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B64 - - // Write-exchange 2 qwords with a stride of 64 qwords. - void - Inst_DS__DS_WRXCHG2ST64_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_CMPST_RTN_B64::Inst_DS__DS_CMPST_RTN_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_b64") - { - } // Inst_DS__DS_CMPST_RTN_B64 - - Inst_DS__DS_CMPST_RTN_B64::~Inst_DS__DS_CMPST_RTN_B64() - { - } // ~Inst_DS__DS_CMPST_RTN_B64 - - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. 
- void - Inst_DS__DS_CMPST_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_CMPST_RTN_F64::Inst_DS__DS_CMPST_RTN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_rtn_f64") - { - setFlag(F64); - } // Inst_DS__DS_CMPST_RTN_F64 - - Inst_DS__DS_CMPST_RTN_F64::~Inst_DS__DS_CMPST_RTN_F64() - { - } // ~Inst_DS__DS_CMPST_RTN_F64 - - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - void - Inst_DS__DS_CMPST_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_RTN_F64::Inst_DS__DS_MIN_RTN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_rtn_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_RTN_F64 - - Inst_DS__DS_MIN_RTN_F64::~Inst_DS__DS_MIN_RTN_F64() - { - } // ~Inst_DS__DS_MIN_RTN_F64 - - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - void - Inst_DS__DS_MIN_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_RTN_F64::Inst_DS__DS_MAX_RTN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_rtn_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_RTN_F64 - - Inst_DS__DS_MAX_RTN_F64::~Inst_DS__DS_MAX_RTN_F64() - { - } // ~Inst_DS__DS_MAX_RTN_F64 - - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. - void - Inst_DS__DS_MAX_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_READ_B64::Inst_DS__DS_READ_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B64 - - Inst_DS__DS_READ_B64::~Inst_DS__DS_READ_B64() - { - } // ~Inst_DS__DS_READ_B64 - - // RETURN_DATA = MEM[ADDR]. - // Read 1 qword. 
- void - Inst_DS__DS_READ_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_READ_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - - Inst_DS__DS_READ2_B64::Inst_DS__DS_READ2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2_B64 - - Inst_DS__DS_READ2_B64::~Inst_DS__DS_READ2_B64() - { - } // ~Inst_DS__DS_READ2_B64 - - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8]. - // Read 2 qwords. 
- void - Inst_DS__DS_READ2_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_READ2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8; - Addr offset1 = instData.OFFSET1 * 8; - - initDualMemRead(gpuDynInst, offset0, offset1); - } // initiateAcc - - void - Inst_DS__DS_READ2_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst0(gpuDynInst, extData.VDST); - VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - - Inst_DS__DS_READ2ST64_B64::Inst_DS__DS_READ2ST64_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2st64_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2ST64_B64 - - Inst_DS__DS_READ2ST64_B64::~Inst_DS__DS_READ2ST64_B64() - { - } // ~Inst_DS__DS_READ2ST64_B64 - - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8 * 64]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8 * 64]. - // Read 2 qwords. 
- void - Inst_DS__DS_READ2ST64_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_READ2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = (instData.OFFSET0 * 8 * 64); - Addr offset1 = (instData.OFFSET1 * 8 * 64); - - initDualMemRead(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_READ2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst0(gpuDynInst, extData.VDST); - VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } - - Inst_DS__DS_CONDXCHG32_RTN_B64::Inst_DS__DS_CONDXCHG32_RTN_B64( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_condxchg32_rtn_b64") - { - } // Inst_DS__DS_CONDXCHG32_RTN_B64 - - Inst_DS__DS_CONDXCHG32_RTN_B64::~Inst_DS__DS_CONDXCHG32_RTN_B64() - { - } // ~Inst_DS__DS_CONDXCHG32_RTN_B64 - - // Conditional write exchange. - void - Inst_DS__DS_CONDXCHG32_RTN_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_ADD_SRC2_U32::Inst_DS__DS_ADD_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_src2_u32") - { - } // Inst_DS__DS_ADD_SRC2_U32 - - Inst_DS__DS_ADD_SRC2_U32::~Inst_DS__DS_ADD_SRC2_U32() - { - } // ~Inst_DS__DS_ADD_SRC2_U32 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] + MEM[B]. 
- void - Inst_DS__DS_ADD_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_SUB_SRC2_U32::Inst_DS__DS_SUB_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_src2_u32") - { - } // Inst_DS__DS_SUB_SRC2_U32 - - Inst_DS__DS_SUB_SRC2_U32::~Inst_DS__DS_SUB_SRC2_U32() - { - } // ~Inst_DS__DS_SUB_SRC2_U32 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] - MEM[B]. - void - Inst_DS__DS_SUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_RSUB_SRC2_U32::Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_src2_u32") - { - } // Inst_DS__DS_RSUB_SRC2_U32 - - Inst_DS__DS_RSUB_SRC2_U32::~Inst_DS__DS_RSUB_SRC2_U32() - { - } // ~Inst_DS__DS_RSUB_SRC2_U32 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B] - MEM[A]. - void - Inst_DS__DS_RSUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_INC_SRC2_U32::Inst_DS__DS_INC_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_src2_u32") - { - } // Inst_DS__DS_INC_SRC2_U32 - - Inst_DS__DS_INC_SRC2_U32::~Inst_DS__DS_INC_SRC2_U32() - { - } // ~Inst_DS__DS_INC_SRC2_U32 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). - void - Inst_DS__DS_INC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_DEC_SRC2_U32::Inst_DS__DS_DEC_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_src2_u32") - { - } // Inst_DS__DS_DEC_SRC2_U32 - - Inst_DS__DS_DEC_SRC2_U32::~Inst_DS__DS_DEC_SRC2_U32() - { - } // ~Inst_DS__DS_DEC_SRC2_U32 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). 
- // Uint decrement. - void - Inst_DS__DS_DEC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_SRC2_I32::Inst_DS__DS_MIN_SRC2_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_i32") - { - } // Inst_DS__DS_MIN_SRC2_I32 - - Inst_DS__DS_MIN_SRC2_I32::~Inst_DS__DS_MIN_SRC2_I32() - { - } // ~Inst_DS__DS_MIN_SRC2_I32 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). - void - Inst_DS__DS_MIN_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_SRC2_I32::Inst_DS__DS_MAX_SRC2_I32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_i32") - { - } // Inst_DS__DS_MAX_SRC2_I32 - - Inst_DS__DS_MAX_SRC2_I32::~Inst_DS__DS_MAX_SRC2_I32() - { - } // ~Inst_DS__DS_MAX_SRC2_I32 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). - void - Inst_DS__DS_MAX_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_SRC2_U32::Inst_DS__DS_MIN_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_u32") - { - } // Inst_DS__DS_MIN_SRC2_U32 - - Inst_DS__DS_MIN_SRC2_U32::~Inst_DS__DS_MIN_SRC2_U32() - { - } // ~Inst_DS__DS_MIN_SRC2_U32 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = min(MEM[A], MEM[B]). - void - Inst_DS__DS_MIN_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_SRC2_U32::Inst_DS__DS_MAX_SRC2_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_u32") - { - } // Inst_DS__DS_MAX_SRC2_U32 - - Inst_DS__DS_MAX_SRC2_U32::~Inst_DS__DS_MAX_SRC2_U32() - { - } // ~Inst_DS__DS_MAX_SRC2_U32 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = max(MEM[A], MEM[B]). 
    // Remaining 32-bit ds_*_src2_* instructions: unsigned max, the B32
    // bitwise ops, a dword write, and the F32 min/max/add variants.  All
    // are unimplemented in gem5 -- each execute() aborts the simulation
    // via panicUnimplemented().

    void
    Inst_DS__DS_MAX_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_AND_SRC2_B32::Inst_DS__DS_AND_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_src2_b32")
    {
    } // Inst_DS__DS_AND_SRC2_B32

    Inst_DS__DS_AND_SRC2_B32::~Inst_DS__DS_AND_SRC2_B32()
    {
    } // ~Inst_DS__DS_AND_SRC2_B32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] & MEM[B].
    void
    Inst_DS__DS_AND_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_OR_SRC2_B32::Inst_DS__DS_OR_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_src2_b32")
    {
    } // Inst_DS__DS_OR_SRC2_B32

    Inst_DS__DS_OR_SRC2_B32::~Inst_DS__DS_OR_SRC2_B32()
    {
    } // ~Inst_DS__DS_OR_SRC2_B32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] | MEM[B].
    void
    Inst_DS__DS_OR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_XOR_SRC2_B32::Inst_DS__DS_XOR_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_src2_b32")
    {
    } // Inst_DS__DS_XOR_SRC2_B32

    Inst_DS__DS_XOR_SRC2_B32::~Inst_DS__DS_XOR_SRC2_B32()
    {
    } // ~Inst_DS__DS_XOR_SRC2_B32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] ^ MEM[B].
    void
    Inst_DS__DS_XOR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_WRITE_SRC2_B32::Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_src2_b32")
    {
        // Flagged as a store even though execute() is unimplemented, so
        // the decoder/pipeline classify the op correctly.
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_SRC2_B32

    Inst_DS__DS_WRITE_SRC2_B32::~Inst_DS__DS_WRITE_SRC2_B32()
    {
    } // ~Inst_DS__DS_WRITE_SRC2_B32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B].
    // Write dword.
    void
    Inst_DS__DS_WRITE_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MIN_SRC2_F32::Inst_DS__DS_MIN_SRC2_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MIN_SRC2_F32

    Inst_DS__DS_MIN_SRC2_F32::~Inst_DS__DS_MIN_SRC2_F32()
    {
    } // ~Inst_DS__DS_MIN_SRC2_F32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A].
    void
    Inst_DS__DS_MIN_SRC2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MAX_SRC2_F32::Inst_DS__DS_MAX_SRC2_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MAX_SRC2_F32

    Inst_DS__DS_MAX_SRC2_F32::~Inst_DS__DS_MAX_SRC2_F32()
    {
    } // ~Inst_DS__DS_MAX_SRC2_F32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A].
    void
    Inst_DS__DS_MAX_SRC2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_ADD_SRC2_F32::Inst_DS__DS_ADD_SRC2_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_src2_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_ADD_SRC2_F32

    Inst_DS__DS_ADD_SRC2_F32::~Inst_DS__DS_ADD_SRC2_F32()
    {
    } // ~Inst_DS__DS_ADD_SRC2_F32

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B] + MEM[A].
    // ds_add_src2_f32 execute, the GWS (global wave sync) family, the
    // consume/append/ordered_count LDS pointer ops, and the first of the
    // 64-bit ds_*_src2_* instructions.  All unimplemented in gem5; every
    // execute() below aborts the simulation via panicUnimplemented().

    void
    Inst_DS__DS_ADD_SRC2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_GWS_SEMA_RELEASE_ALL::Inst_DS__DS_GWS_SEMA_RELEASE_ALL(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_release_all")
    {
    } // Inst_DS__DS_GWS_SEMA_RELEASE_ALL

    Inst_DS__DS_GWS_SEMA_RELEASE_ALL::~Inst_DS__DS_GWS_SEMA_RELEASE_ALL()
    {
    } // ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL

    void
    Inst_DS__DS_GWS_SEMA_RELEASE_ALL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_GWS_INIT::Inst_DS__DS_GWS_INIT(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_init")
    {
    } // Inst_DS__DS_GWS_INIT

    Inst_DS__DS_GWS_INIT::~Inst_DS__DS_GWS_INIT()
    {
    } // ~Inst_DS__DS_GWS_INIT

    void
    Inst_DS__DS_GWS_INIT::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_GWS_SEMA_V::Inst_DS__DS_GWS_SEMA_V(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_v")
    {
    } // Inst_DS__DS_GWS_SEMA_V

    Inst_DS__DS_GWS_SEMA_V::~Inst_DS__DS_GWS_SEMA_V()
    {
    } // ~Inst_DS__DS_GWS_SEMA_V

    void
    Inst_DS__DS_GWS_SEMA_V::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_GWS_SEMA_BR::Inst_DS__DS_GWS_SEMA_BR(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_br")
    {
    } // Inst_DS__DS_GWS_SEMA_BR

    Inst_DS__DS_GWS_SEMA_BR::~Inst_DS__DS_GWS_SEMA_BR()
    {
    } // ~Inst_DS__DS_GWS_SEMA_BR

    void
    Inst_DS__DS_GWS_SEMA_BR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_GWS_SEMA_P::Inst_DS__DS_GWS_SEMA_P(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_p")
    {
    } // Inst_DS__DS_GWS_SEMA_P

    Inst_DS__DS_GWS_SEMA_P::~Inst_DS__DS_GWS_SEMA_P()
    {
    } // ~Inst_DS__DS_GWS_SEMA_P

    void
    Inst_DS__DS_GWS_SEMA_P::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_GWS_BARRIER::Inst_DS__DS_GWS_BARRIER(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_barrier")
    {
    } // Inst_DS__DS_GWS_BARRIER

    Inst_DS__DS_GWS_BARRIER::~Inst_DS__DS_GWS_BARRIER()
    {
    } // ~Inst_DS__DS_GWS_BARRIER

    void
    Inst_DS__DS_GWS_BARRIER::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_CONSUME::Inst_DS__DS_CONSUME(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_consume")
    {
    } // Inst_DS__DS_CONSUME

    Inst_DS__DS_CONSUME::~Inst_DS__DS_CONSUME()
    {
    } // ~Inst_DS__DS_CONSUME

    void
    Inst_DS__DS_CONSUME::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_APPEND::Inst_DS__DS_APPEND(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_append")
    {
    } // Inst_DS__DS_APPEND

    Inst_DS__DS_APPEND::~Inst_DS__DS_APPEND()
    {
    } // ~Inst_DS__DS_APPEND

    void
    Inst_DS__DS_APPEND::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_ORDERED_COUNT::Inst_DS__DS_ORDERED_COUNT(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_ordered_count")
    {
    } // Inst_DS__DS_ORDERED_COUNT

    Inst_DS__DS_ORDERED_COUNT::~Inst_DS__DS_ORDERED_COUNT()
    {
    } // ~Inst_DS__DS_ORDERED_COUNT

    void
    Inst_DS__DS_ORDERED_COUNT::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_ADD_SRC2_U64::Inst_DS__DS_ADD_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_src2_u64")
    {
    } // Inst_DS__DS_ADD_SRC2_U64

    Inst_DS__DS_ADD_SRC2_U64::~Inst_DS__DS_ADD_SRC2_U64()
    {
    } // ~Inst_DS__DS_ADD_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] + MEM[B].
    void
    Inst_DS__DS_ADD_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_SUB_SRC2_U64::Inst_DS__DS_SUB_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_src2_u64")
    {
    } // Inst_DS__DS_SUB_SRC2_U64

    Inst_DS__DS_SUB_SRC2_U64::~Inst_DS__DS_SUB_SRC2_U64()
    {
    } // ~Inst_DS__DS_SUB_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] - MEM[B].
    // 64-bit ds_*_src2_* instructions (sub/rsub/inc/dec, signed and
    // unsigned min/max, the B64 bitwise ops, and the qword write).  As
    // with their 32-bit counterparts above, none are implemented: each
    // execute() aborts the simulation via panicUnimplemented().

    void
    Inst_DS__DS_SUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_RSUB_SRC2_U64::Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_src2_u64")
    {
    } // Inst_DS__DS_RSUB_SRC2_U64

    Inst_DS__DS_RSUB_SRC2_U64::~Inst_DS__DS_RSUB_SRC2_U64()
    {
    } // ~Inst_DS__DS_RSUB_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B] - MEM[A].
    void
    Inst_DS__DS_RSUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_INC_SRC2_U64::Inst_DS__DS_INC_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_src2_u64")
    {
    } // Inst_DS__DS_INC_SRC2_U64

    Inst_DS__DS_INC_SRC2_U64::~Inst_DS__DS_INC_SRC2_U64()
    {
    } // ~Inst_DS__DS_INC_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1).
    void
    Inst_DS__DS_INC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_DEC_SRC2_U64::Inst_DS__DS_DEC_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_src2_u64")
    {
    } // Inst_DS__DS_DEC_SRC2_U64

    Inst_DS__DS_DEC_SRC2_U64::~Inst_DS__DS_DEC_SRC2_U64()
    {
    } // ~Inst_DS__DS_DEC_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1).
    // Uint decrement.
    void
    Inst_DS__DS_DEC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MIN_SRC2_I64::Inst_DS__DS_MIN_SRC2_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_i64")
    {
    } // Inst_DS__DS_MIN_SRC2_I64

    Inst_DS__DS_MIN_SRC2_I64::~Inst_DS__DS_MIN_SRC2_I64()
    {
    } // ~Inst_DS__DS_MIN_SRC2_I64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MIN_SRC2_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MAX_SRC2_I64::Inst_DS__DS_MAX_SRC2_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_i64")
    {
    } // Inst_DS__DS_MAX_SRC2_I64

    Inst_DS__DS_MAX_SRC2_I64::~Inst_DS__DS_MAX_SRC2_I64()
    {
    } // ~Inst_DS__DS_MAX_SRC2_I64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MAX_SRC2_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MIN_SRC2_U64::Inst_DS__DS_MIN_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_u64")
    {
    } // Inst_DS__DS_MIN_SRC2_U64

    Inst_DS__DS_MIN_SRC2_U64::~Inst_DS__DS_MIN_SRC2_U64()
    {
    } // ~Inst_DS__DS_MIN_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MIN_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_MAX_SRC2_U64::Inst_DS__DS_MAX_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_u64")
    {
    } // Inst_DS__DS_MAX_SRC2_U64

    Inst_DS__DS_MAX_SRC2_U64::~Inst_DS__DS_MAX_SRC2_U64()
    {
    } // ~Inst_DS__DS_MAX_SRC2_U64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MAX_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_AND_SRC2_B64::Inst_DS__DS_AND_SRC2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_src2_b64")
    {
    } // Inst_DS__DS_AND_SRC2_B64

    Inst_DS__DS_AND_SRC2_B64::~Inst_DS__DS_AND_SRC2_B64()
    {
    } // ~Inst_DS__DS_AND_SRC2_B64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] & MEM[B].
    void
    Inst_DS__DS_AND_SRC2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_OR_SRC2_B64::Inst_DS__DS_OR_SRC2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_src2_b64")
    {
    } // Inst_DS__DS_OR_SRC2_B64

    Inst_DS__DS_OR_SRC2_B64::~Inst_DS__DS_OR_SRC2_B64()
    {
    } // ~Inst_DS__DS_OR_SRC2_B64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] | MEM[B].
    void
    Inst_DS__DS_OR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_XOR_SRC2_B64::Inst_DS__DS_XOR_SRC2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_src2_b64")
    {
    } // Inst_DS__DS_XOR_SRC2_B64

    Inst_DS__DS_XOR_SRC2_B64::~Inst_DS__DS_XOR_SRC2_B64()
    {
    } // ~Inst_DS__DS_XOR_SRC2_B64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] ^ MEM[B].
    void
    Inst_DS__DS_XOR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_DS__DS_WRITE_SRC2_B64::Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_src2_b64")
    {
        // Flagged as a store even though execute() is unimplemented, so
        // the decoder/pipeline classify the op correctly.
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_SRC2_B64

    Inst_DS__DS_WRITE_SRC2_B64::~Inst_DS__DS_WRITE_SRC2_B64()
    {
    } // ~Inst_DS__DS_WRITE_SRC2_B64

    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    //            {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B].
    // Write qword.
- void - Inst_DS__DS_WRITE_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MIN_SRC2_F64::Inst_DS__DS_MIN_SRC2_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_SRC2_F64 - - Inst_DS__DS_MIN_SRC2_F64::~Inst_DS__DS_MIN_SRC2_F64() - { - } // ~Inst_DS__DS_MIN_SRC2_F64 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. - void - Inst_DS__DS_MIN_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_MAX_SRC2_F64::Inst_DS__DS_MAX_SRC2_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_SRC2_F64 - - Inst_DS__DS_MAX_SRC2_F64::~Inst_DS__DS_MAX_SRC2_F64() - { - } // ~Inst_DS__DS_MAX_SRC2_F64 - - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. - void - Inst_DS__DS_MAX_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_DS__DS_WRITE_B96::Inst_DS__DS_WRITE_B96(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b96") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B96 - - Inst_DS__DS_WRITE_B96::~Inst_DS__DS_WRITE_B96() - { - } // ~Inst_DS__DS_WRITE_B96 - - // {MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[95:0]. - // Tri-dword write. 
- void - Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite<3>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b128") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B128 - - Inst_DS__DS_WRITE_B128::~Inst_DS__DS_WRITE_B128() - { - } // ~Inst_DS__DS_WRITE_B128 - - // {MEM[ADDR + 12], MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[127:0]. - // Qword write. 
- void - Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite<4>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b96") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B96 - - Inst_DS__DS_READ_B96::~Inst_DS__DS_READ_B96() - { - } // ~Inst_DS__DS_READ_B96 - - // Tri-dword read. 
- void - Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead<3>(gpuDynInst, offset); - } - - void - Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } - - Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b128") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B128 - - Inst_DS__DS_READ_B128::~Inst_DS__DS_READ_B128() - { - } // ~Inst_DS__DS_READ_B128 - - // Qword read. 
- void - Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } - - void - Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead<4>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - - Inst_MUBUF__BUFFER_LOAD_FORMAT_X - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_X - - Inst_MUBUF__BUFFER_LOAD_FORMAT_X::~Inst_MUBUF__BUFFER_LOAD_FORMAT_X() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_X - - // Untyped buffer load 1 dword with format conversion. 
    // MUBUF buffer_load_format_* / buffer_store_format_* instructions
    // (x/xy/xyz/xyzw).  None are implemented in gem5: execute() aborts
    // the simulation via panicUnimplemented(), and the initiateAcc /
    // completeAcc hooks are empty placeholders that are never reached.

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_xy")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY

    // Untyped buffer load 2 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ

    // Untyped buffer load 3 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW

    // Untyped buffer load 4 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_STORE_FORMAT_X
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_x")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_X

    Inst_MUBUF__BUFFER_STORE_FORMAT_X::~Inst_MUBUF__BUFFER_STORE_FORMAT_X()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_X

    // Untyped buffer store 1 dword with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_STORE_FORMAT_XY
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_xy")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XY

    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::~Inst_MUBUF__BUFFER_STORE_FORMAT_XY()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XY

    // Untyped buffer store 2 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ

    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ

    // Untyped buffer store 3 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW

    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW

    // Untyped buffer store 4 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_d16_x")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X

    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X

    // Untyped buffer load 1 dword with format conversion.
- void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - - // Untyped buffer load 2 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - - // Untyped buffer load 3 dwords with format conversion. 
- void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - // Untyped buffer load 4 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - - // Untyped buffer store 1 dword with format conversion. 
- void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - - // Untyped buffer store 2 dwords with format conversion. - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - - // Untyped buffer store 3 dwords with format conversion. 
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled: executing this opcode is a fatal simulation error.
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW

    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW

    // Untyped buffer store 4 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled: executing this opcode is a fatal simulation error.
        panicUnimplemented();
    }

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MUBUF__BUFFER_LOAD_UBYTE
        ::Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_ubyte")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        // The LDS bit redirects the result to LDS instead of VGPRs.
        if (instData.LDS) {
            setFlag(GroupSegment);
        } else {
            setFlag(GlobalSegment);
        }
    } // Inst_MUBUF__BUFFER_LOAD_UBYTE

    Inst_MUBUF__BUFFER_LOAD_UBYTE::~Inst_MUBUF__BUFFER_LOAD_UBYTE()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_UBYTE

    // Untyped buffer load unsigned byte (zero extend to VGPR destination).
- void - Inst_MUBUF__BUFFER_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe. - issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } - } - - void - Inst_MUBUF__BUFFER_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } - - - Inst_MUBUF__BUFFER_LOAD_SBYTE - ::Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_sbyte") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_SBYTE - - Inst_MUBUF__BUFFER_LOAD_SBYTE::~Inst_MUBUF__BUFFER_LOAD_SBYTE() - { - } // ~Inst_MUBUF__BUFFER_LOAD_SBYTE - - // Untyped buffer load signed byte (sign extend to VGPR destination). - void - Inst_MUBUF__BUFFER_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MUBUF__BUFFER_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_LOAD_USHORT - ::Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_ushort") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_USHORT - - Inst_MUBUF__BUFFER_LOAD_USHORT::~Inst_MUBUF__BUFFER_LOAD_USHORT() - { - } // ~Inst_MUBUF__BUFFER_LOAD_USHORT - - // Untyped buffer load unsigned short (zero extend to VGPR destination). 
- void - Inst_MUBUF__BUFFER_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - } - - void - Inst_MUBUF__BUFFER_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } - - - Inst_MUBUF__BUFFER_LOAD_SSHORT - ::Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_sshort") - { - setFlag(MemoryRef); - 
setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_SSHORT - - Inst_MUBUF__BUFFER_LOAD_SSHORT::~Inst_MUBUF__BUFFER_LOAD_SSHORT() - { - } // ~Inst_MUBUF__BUFFER_LOAD_SSHORT - - // Untyped buffer load signed short (sign extend to VGPR destination). - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_LOAD_DWORD - ::Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORD - - Inst_MUBUF__BUFFER_LOAD_DWORD::~Inst_MUBUF__BUFFER_LOAD_DWORD() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORD - - // Untyped buffer load dword. 
- void - Inst_MUBUF__BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - } - - void - Inst_MUBUF__BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // completeAcc - - Inst_MUBUF__BUFFER_LOAD_DWORDX2 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx2") - { - setFlag(MemoryRef); - 
setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 - - Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX2 - - // Untyped buffer load 2 dwords. - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; 
++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - - Inst_MUBUF__BUFFER_LOAD_DWORDX3 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx3") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 - - Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX3 - - // Untyped buffer load 3 dwords. - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - 
.issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<3>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - vdst2[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } // completeAcc - - Inst_MUBUF__BUFFER_LOAD_DWORDX4 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX4 - - Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX4 - - // Untyped buffer load 4 dwords. 
- void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - 
gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - vdst2[lane] = 0; - vdst3[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - - Inst_MUBUF__BUFFER_STORE_BYTE - ::Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_byte") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_BYTE - - Inst_MUBUF__BUFFER_STORE_BYTE::~Inst_MUBUF__BUFFER_STORE_BYTE() - { - } // ~Inst_MUBUF__BUFFER_STORE_BYTE - - // Untyped buffer store byte. - void - Inst_MUBUF__BUFFER_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandI8 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, 
- addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - } - - void - Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_STORE_SHORT - ::Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_short") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_SHORT - - Inst_MUBUF__BUFFER_STORE_SHORT::~Inst_MUBUF__BUFFER_STORE_SHORT() - { - } // ~Inst_MUBUF__BUFFER_STORE_SHORT - - // Untyped buffer store short. 
- void - Inst_MUBUF__BUFFER_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandI16 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - } - - void - Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MUBUF__BUFFER_STORE_DWORD:: - Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } 
else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORD - - Inst_MUBUF__BUFFER_STORE_DWORD::~Inst_MUBUF__BUFFER_STORE_DWORD() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORD - - // Untyped buffer store dword. - void - Inst_MUBUF__BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - } - - void - Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // 
completeAcc - - Inst_MUBUF__BUFFER_STORE_DWORDX2 - ::Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX2 - - Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX2 - - // Untyped buffer store 2 dwords. - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if 
(gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<2>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_MUBUF__BUFFER_STORE_DWORDX3 - ::Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx3") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX3 - - Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX3 - - // Untyped buffer store 3 dwords. - void - Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - data2.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else 
if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] - = data2[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<3>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_MUBUF__BUFFER_STORE_DWORDX4 - ::Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX4 - - Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX4 - - // Untyped buffer store 4 dwords. 
- void - Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - if (isLocalMem()) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] - = data2[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 3] - = data3[lane]; - } - } - } // 
execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_MUBUF__BUFFER_STORE_LDS_DWORD - ::Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_lds_dword") - { - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_LDS_DWORD - - Inst_MUBUF__BUFFER_STORE_LDS_DWORD::~Inst_MUBUF__BUFFER_STORE_LDS_DWORD() - { - } // ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD - - // Store one DWORD from LDS memory to system memory without utilizing - // VGPRs. - void - Inst_MUBUF__BUFFER_STORE_LDS_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_WBINVL1::Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_wbinvl1") - { - setFlag(MemoryRef); - setFlag(GPUStaticInst::MemSync); - setFlag(GlobalSegment); - setFlag(MemSync); - } // Inst_MUBUF__BUFFER_WBINVL1 - - Inst_MUBUF__BUFFER_WBINVL1::~Inst_MUBUF__BUFFER_WBINVL1() - { - } // ~Inst_MUBUF__BUFFER_WBINVL1 - - // Write back and invalidate the shader L1. - // Always returns ACK to shader. - void - Inst_MUBUF__BUFFER_WBINVL1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_MUBUF__BUFFER_WBINVL1::initiateAcc(GPUDynInstPtr gpuDynInst) - { - injectGlobalMemFence(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_WBINVL1::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_MUBUF__BUFFER_WBINVL1_VOL - ::Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF*iFmt) - : Inst_MUBUF(iFmt, "buffer_wbinvl1_vol") { - /** - * This instruction is same as buffer_wbinvl1 instruction except this - * instruction only invalidate L1 shader line with MTYPE for system - * or group coherence. Since L1 do not differentiate between its cache - * lines, this instruction currently behaves (and implemented ) - * exactly like buffer_wbinvl1 instruction. - */ - setFlag(MemoryRef); - setFlag(GPUStaticInst::MemSync); - setFlag(GlobalSegment); - setFlag(MemSync); - } // Inst_MUBUF__BUFFER_WBINVL1_VOL - - Inst_MUBUF__BUFFER_WBINVL1_VOL::~Inst_MUBUF__BUFFER_WBINVL1_VOL() - { - } // ~Inst_MUBUF__BUFFER_WBINVL1_VOL - - // Write back and invalidate the shader L1 only for lines that are marked - // volatile. Always returns ACK to shader. - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst) - { - injectGlobalMemFence(gpuDynInst); - } // initiateAcc - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_MUBUF__BUFFER_ATOMIC_SWAP - ::Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_swap") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } // if - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SWAP - - Inst_MUBUF__BUFFER_ATOMIC_SWAP::~Inst_MUBUF__BUFFER_ATOMIC_SWAP() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP - - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_ADD - ::Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } // if - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_ADD - - Inst_MUBUF__BUFFER_ATOMIC_ADD::~Inst_MUBUF__BUFFER_ATOMIC_ADD() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD - - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_SUB - ::Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SUB - - Inst_MUBUF__BUFFER_ATOMIC_SUB::~Inst_MUBUF__BUFFER_ATOMIC_SUB() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_SMIN - ::Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMIN - - Inst_MUBUF__BUFFER_ATOMIC_SMIN::~Inst_MUBUF__BUFFER_ATOMIC_SMIN() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_UMIN - ::Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMIN - - Inst_MUBUF__BUFFER_ATOMIC_UMIN::~Inst_MUBUF__BUFFER_ATOMIC_UMIN() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_SMAX - ::Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMAX - - Inst_MUBUF__BUFFER_ATOMIC_SMAX::~Inst_MUBUF__BUFFER_ATOMIC_SMAX() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_UMAX - ::Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } // if - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMAX - - Inst_MUBUF__BUFFER_ATOMIC_UMAX::~Inst_MUBUF__BUFFER_ATOMIC_UMAX() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_AND - ::Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_AND - - Inst_MUBUF__BUFFER_ATOMIC_AND::~Inst_MUBUF__BUFFER_ATOMIC_AND() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_AND - - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_OR - ::Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_OR - - Inst_MUBUF__BUFFER_ATOMIC_OR::~Inst_MUBUF__BUFFER_ATOMIC_OR() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_OR - - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_XOR - ::Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_XOR - - Inst_MUBUF__BUFFER_ATOMIC_XOR::~Inst_MUBUF__BUFFER_ATOMIC_XOR() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR - - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_INC - ::Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_INC - - Inst_MUBUF__BUFFER_ATOMIC_INC::~Inst_MUBUF__BUFFER_ATOMIC_INC() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_INC - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_DEC - ::Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_DEC - - Inst_MUBUF__BUFFER_ATOMIC_DEC::~Inst_MUBUF__BUFFER_ATOMIC_DEC() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_swap_x2") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap_x2") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - ::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - // tmp = MEM[ADDR]; - // src = DATA[0:1]; - // cmp = DATA[2:3]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_add_x2") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_sub_x2") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? 
DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_and_x2") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - - Inst_MUBUF__BUFFER_ATOMIC_AND_X2::~Inst_MUBUF__BUFFER_ATOMIC_AND_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_or_x2") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - - Inst_MUBUF__BUFFER_ATOMIC_OR_X2::~Inst_MUBUF__BUFFER_ATOMIC_OR_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_xor_x2") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_inc_x2") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - - Inst_MUBUF__BUFFER_ATOMIC_INC_X2::~Inst_MUBUF__BUFFER_ATOMIC_INC_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_dec_x2") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - - // Typed buffer load 1 dword with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - - // Typed buffer load 2 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - // Typed buffer load 3 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - // Typed buffer load 4 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_STORE_FORMAT_X - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_X - - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X - - // Typed buffer store 1 dword with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - - // Typed buffer store 2 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - // Typed buffer store 3 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - // Typed buffer store 4 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X:: - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - // Typed buffer load 1 dword with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - // Typed buffer load 2 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ( - InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - // Typed buffer load 3 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW( - InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - // Typed buffer load 4 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - // Typed buffer store 1 dword with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - // Typed buffer store 2 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - - // Typed buffer store 3 dwords with format conversion. 
    // Remaining MTBUF D16 stores and the MIMG image-load family.
    // All are decoded (flags are set in the constructors) but execute()
    // panics: none of these instructions is implemented yet.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW

    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW()
    {
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW

    // Typed buffer store 4 dwords with format conversion.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_LOAD::Inst_MIMG__IMAGE_LOAD(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD

    Inst_MIMG__IMAGE_LOAD::~Inst_MIMG__IMAGE_LOAD()
    {
    } // ~Inst_MIMG__IMAGE_LOAD

    // Image memory load with format conversion specified
    void
    Inst_MIMG__IMAGE_LOAD::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_LOAD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_LOAD_MIP::Inst_MIMG__IMAGE_LOAD_MIP(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_mip")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_MIP

    Inst_MIMG__IMAGE_LOAD_MIP::~Inst_MIMG__IMAGE_LOAD_MIP()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_MIP

    void
    Inst_MIMG__IMAGE_LOAD_MIP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_LOAD_MIP::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_MIP::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_LOAD_PCK::Inst_MIMG__IMAGE_LOAD_PCK(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_pck")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_PCK

    Inst_MIMG__IMAGE_LOAD_PCK::~Inst_MIMG__IMAGE_LOAD_PCK()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_PCK

    void
    Inst_MIMG__IMAGE_LOAD_PCK::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_LOAD_PCK::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_PCK::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_LOAD_PCK_SGN::Inst_MIMG__IMAGE_LOAD_PCK_SGN(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_pck_sgn")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_PCK_SGN

    Inst_MIMG__IMAGE_LOAD_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_PCK_SGN()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_PCK_SGN

    // Image memory load with no format conversion and sign extension
    void
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_LOAD_MIP_PCK::Inst_MIMG__IMAGE_LOAD_MIP_PCK(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_mip_pck")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_MIP_PCK
    // MIMG image stores, image_get_resinfo, and the first two image
    // atomics. All execute() bodies panic: unimplemented. For the atomics,
    // the instruction's GLC bit selects the AtomicReturn flag (result
    // written back) versus AtomicNoReturn.
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::~Inst_MIMG__IMAGE_LOAD_MIP_PCK()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK

    // Image memory load with user-supplied mip level, no format conversion
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_mip_pck_sgn")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN

    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN

    // Image memory load with user-supplied mip level, no format conversion.
    // NOTE(review): comment matches MIP_PCK above; presumably this variant
    // also sign-extends (per its _sgn suffix) -- confirm against the ISA.
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_STORE::Inst_MIMG__IMAGE_STORE(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE

    Inst_MIMG__IMAGE_STORE::~Inst_MIMG__IMAGE_STORE()
    {
    } // ~Inst_MIMG__IMAGE_STORE

    // Image memory store with format conversion specified
    void
    Inst_MIMG__IMAGE_STORE::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_STORE::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_STORE::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_STORE_MIP::Inst_MIMG__IMAGE_STORE_MIP(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store_mip")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE_MIP

    Inst_MIMG__IMAGE_STORE_MIP::~Inst_MIMG__IMAGE_STORE_MIP()
    {
    } // ~Inst_MIMG__IMAGE_STORE_MIP

    void
    Inst_MIMG__IMAGE_STORE_MIP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_STORE_MIP::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_STORE_MIP::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_STORE_PCK::Inst_MIMG__IMAGE_STORE_PCK(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store_pck")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE_PCK

    Inst_MIMG__IMAGE_STORE_PCK::~Inst_MIMG__IMAGE_STORE_PCK()
    {
    } // ~Inst_MIMG__IMAGE_STORE_PCK

    // Image memory store of packed data without format conversion.
    void
    Inst_MIMG__IMAGE_STORE_PCK::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_STORE_PCK::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_STORE_PCK::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_STORE_MIP_PCK::Inst_MIMG__IMAGE_STORE_MIP_PCK(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store_mip_pck")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE_MIP_PCK

    Inst_MIMG__IMAGE_STORE_MIP_PCK::~Inst_MIMG__IMAGE_STORE_MIP_PCK()
    {
    } // ~Inst_MIMG__IMAGE_STORE_MIP_PCK

    // Image memory store of packed data without format conversion
    void
    Inst_MIMG__IMAGE_STORE_MIP_PCK::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    void
    Inst_MIMG__IMAGE_STORE_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_STORE_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    }

    Inst_MIMG__IMAGE_GET_RESINFO::Inst_MIMG__IMAGE_GET_RESINFO(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_get_resinfo")
    {
        // NOTE(review): unlike the loads/stores above, no MemoryRef flag
        // is set here -- this instruction is not treated as a memory ref.
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GET_RESINFO

    Inst_MIMG__IMAGE_GET_RESINFO::~Inst_MIMG__IMAGE_GET_RESINFO()
    {
    } // ~Inst_MIMG__IMAGE_GET_RESINFO

    void
    Inst_MIMG__IMAGE_GET_RESINFO::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_SWAP::Inst_MIMG__IMAGE_ATOMIC_SWAP(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_swap")
    {
        setFlag(AtomicExch);
        // GLC bit selects whether the atomic returns the pre-op value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SWAP

    Inst_MIMG__IMAGE_ATOMIC_SWAP::~Inst_MIMG__IMAGE_ATOMIC_SWAP()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SWAP

    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::Inst_MIMG__IMAGE_ATOMIC_CMPSWAP(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_cmpswap")
    {
        setFlag(AtomicCAS);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP

    Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP

    // tmp = MEM[ADDR];
    // src = DATA[0];
    // cmp = DATA[1];
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Image atomics: add, sub, smin. All unimplemented (execute panics);
    // constructors set the atomic-op flag and, via the GLC bit, the
    // AtomicReturn/AtomicNoReturn variant.
    void
    Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_ADD::Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_add")
    {
        setFlag(AtomicAdd);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_ADD

    Inst_MIMG__IMAGE_ATOMIC_ADD::~Inst_MIMG__IMAGE_ATOMIC_ADD()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_ADD

    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_SUB::Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_sub")
    {
        setFlag(AtomicSub);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SUB

    Inst_MIMG__IMAGE_ATOMIC_SUB::~Inst_MIMG__IMAGE_ATOMIC_SUB()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SUB

    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_SMIN::Inst_MIMG__IMAGE_ATOMIC_SMIN(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_smin")
    {
        setFlag(AtomicMin);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SMIN

    Inst_MIMG__IMAGE_ATOMIC_SMIN::~Inst_MIMG__IMAGE_ATOMIC_SMIN()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SMIN

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    // Image atomics: umin, smax, umax. All unimplemented (execute panics).
    // NOTE(review): both smin/umin map to AtomicMin and smax/umax to
    // AtomicMax here -- the signed/unsigned distinction is not encoded in
    // the flag; confirm downstream handling if these are ever implemented.
    void
    Inst_MIMG__IMAGE_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_UMIN::Inst_MIMG__IMAGE_ATOMIC_UMIN(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_umin")
    {
        setFlag(AtomicMin);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_UMIN

    Inst_MIMG__IMAGE_ATOMIC_UMIN::~Inst_MIMG__IMAGE_ATOMIC_UMIN()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_UMIN

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_SMAX::Inst_MIMG__IMAGE_ATOMIC_SMAX(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_smax")
    {
        setFlag(AtomicMax);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SMAX

    Inst_MIMG__IMAGE_ATOMIC_SMAX::~Inst_MIMG__IMAGE_ATOMIC_SMAX()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SMAX

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_UMAX::Inst_MIMG__IMAGE_ATOMIC_UMAX(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_umax")
    {
        setFlag(AtomicMax);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_UMAX

    Inst_MIMG__IMAGE_ATOMIC_UMAX::~Inst_MIMG__IMAGE_ATOMIC_UMAX()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_UMAX

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    // Image atomics: and, or, xor. All unimplemented (execute panics).
    void
    Inst_MIMG__IMAGE_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_AND::Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_and")
    {
        setFlag(AtomicAnd);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_AND

    Inst_MIMG__IMAGE_ATOMIC_AND::~Inst_MIMG__IMAGE_ATOMIC_AND()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_AND

    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_OR::Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_or")
    {
        setFlag(AtomicOr);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_OR

    Inst_MIMG__IMAGE_ATOMIC_OR::~Inst_MIMG__IMAGE_ATOMIC_OR()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_OR

    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_XOR::Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_xor")
    {
        setFlag(AtomicXor);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_XOR

    Inst_MIMG__IMAGE_ATOMIC_XOR::~Inst_MIMG__IMAGE_ATOMIC_XOR()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_XOR

    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA;
    // RETURN_DATA = tmp.
    // Image atomics: inc, dec (wrapping increment/decrement against a
    // limit operand). All unimplemented (execute panics).
    void
    Inst_MIMG__IMAGE_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_INC::Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_inc")
    {
        setFlag(AtomicInc);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_INC

    Inst_MIMG__IMAGE_ATOMIC_INC::~Inst_MIMG__IMAGE_ATOMIC_INC()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_INC

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_ATOMIC_DEC::Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_dec")
    {
        setFlag(AtomicDec);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_DEC

    Inst_MIMG__IMAGE_ATOMIC_DEC::~Inst_MIMG__IMAGE_ATOMIC_DEC()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_DEC

    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare); RETURN_DATA = tmp.
    // Image sample instructions. Each is decoded with only the
    // GlobalSegment flag (no MemoryRef/Load flag is set, unlike the image
    // loads above) and execute() panics: none is implemented.
    void
    Inst_MIMG__IMAGE_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE::Inst_MIMG__IMAGE_SAMPLE(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE

    Inst_MIMG__IMAGE_SAMPLE::~Inst_MIMG__IMAGE_SAMPLE()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE

    void
    Inst_MIMG__IMAGE_SAMPLE::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_CL::Inst_MIMG__IMAGE_SAMPLE_CL(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CL

    Inst_MIMG__IMAGE_SAMPLE_CL::~Inst_MIMG__IMAGE_SAMPLE_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_CL

    void
    Inst_MIMG__IMAGE_SAMPLE_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_D::Inst_MIMG__IMAGE_SAMPLE_D(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D

    Inst_MIMG__IMAGE_SAMPLE_D::~Inst_MIMG__IMAGE_SAMPLE_D()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D

    void
    Inst_MIMG__IMAGE_SAMPLE_D::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_D_CL::Inst_MIMG__IMAGE_SAMPLE_D_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D_CL

    Inst_MIMG__IMAGE_SAMPLE_D_CL::~Inst_MIMG__IMAGE_SAMPLE_D_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL

    void
    Inst_MIMG__IMAGE_SAMPLE_D_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_L::Inst_MIMG__IMAGE_SAMPLE_L(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_l")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_L

    Inst_MIMG__IMAGE_SAMPLE_L::~Inst_MIMG__IMAGE_SAMPLE_L()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_L

    void
    Inst_MIMG__IMAGE_SAMPLE_L::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_B::Inst_MIMG__IMAGE_SAMPLE_B(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B

    Inst_MIMG__IMAGE_SAMPLE_B::~Inst_MIMG__IMAGE_SAMPLE_B()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_B

    void
    Inst_MIMG__IMAGE_SAMPLE_B::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_B_CL::Inst_MIMG__IMAGE_SAMPLE_B_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B_CL

    Inst_MIMG__IMAGE_SAMPLE_B_CL::~Inst_MIMG__IMAGE_SAMPLE_B_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL

    void
    Inst_MIMG__IMAGE_SAMPLE_B_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_LZ::Inst_MIMG__IMAGE_SAMPLE_LZ(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_lz")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_LZ

    Inst_MIMG__IMAGE_SAMPLE_LZ::~Inst_MIMG__IMAGE_SAMPLE_LZ()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_LZ

    void
    Inst_MIMG__IMAGE_SAMPLE_LZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C::Inst_MIMG__IMAGE_SAMPLE_C(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C

    Inst_MIMG__IMAGE_SAMPLE_C::~Inst_MIMG__IMAGE_SAMPLE_C()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C

    void
    Inst_MIMG__IMAGE_SAMPLE_C::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_CL::Inst_MIMG__IMAGE_SAMPLE_C_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_CL

    Inst_MIMG__IMAGE_SAMPLE_C_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL

    void
    Inst_MIMG__IMAGE_SAMPLE_C_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }
    // Image sample variants (depth-compare "_c", derivative "_d", clamp
    // "_cl", lod "_l"/"_lz", bias "_b", offset "_o" combinations). Same
    // pattern throughout: GlobalSegment flag only, execute() panics.
    Inst_MIMG__IMAGE_SAMPLE_C_D::Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D

    Inst_MIMG__IMAGE_SAMPLE_C_D::~Inst_MIMG__IMAGE_SAMPLE_C_D()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_D

    void
    Inst_MIMG__IMAGE_SAMPLE_C_D::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_D_CL::Inst_MIMG__IMAGE_SAMPLE_C_D_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL

    Inst_MIMG__IMAGE_SAMPLE_C_D_CL::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL

    void
    Inst_MIMG__IMAGE_SAMPLE_C_D_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_L::Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_l")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_L

    Inst_MIMG__IMAGE_SAMPLE_C_L::~Inst_MIMG__IMAGE_SAMPLE_C_L()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_L

    void
    Inst_MIMG__IMAGE_SAMPLE_C_L::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_B::Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_b")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_B

    Inst_MIMG__IMAGE_SAMPLE_C_B::~Inst_MIMG__IMAGE_SAMPLE_C_B()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_B

    void
    Inst_MIMG__IMAGE_SAMPLE_C_B::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_B_CL::Inst_MIMG__IMAGE_SAMPLE_C_B_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_b_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL

    Inst_MIMG__IMAGE_SAMPLE_C_B_CL::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL

    void
    Inst_MIMG__IMAGE_SAMPLE_C_B_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_LZ::Inst_MIMG__IMAGE_SAMPLE_C_LZ(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_lz")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_LZ

    Inst_MIMG__IMAGE_SAMPLE_C_LZ::~Inst_MIMG__IMAGE_SAMPLE_C_LZ()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ

    void
    Inst_MIMG__IMAGE_SAMPLE_C_LZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_O::Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_O

    Inst_MIMG__IMAGE_SAMPLE_O::~Inst_MIMG__IMAGE_SAMPLE_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_O

    void
    Inst_MIMG__IMAGE_SAMPLE_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_CL_O::Inst_MIMG__IMAGE_SAMPLE_CL_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cl_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CL_O

    Inst_MIMG__IMAGE_SAMPLE_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CL_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_CL_O

    void
    Inst_MIMG__IMAGE_SAMPLE_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_D_O::Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D_O

    Inst_MIMG__IMAGE_SAMPLE_D_O::~Inst_MIMG__IMAGE_SAMPLE_D_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D_O

    void
    Inst_MIMG__IMAGE_SAMPLE_D_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_D_CL_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d_cl_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D_CL_O

    Inst_MIMG__IMAGE_SAMPLE_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_D_CL_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL_O

    void
    Inst_MIMG__IMAGE_SAMPLE_D_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_L_O::Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_l_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_L_O

    Inst_MIMG__IMAGE_SAMPLE_L_O::~Inst_MIMG__IMAGE_SAMPLE_L_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_L_O

    void
    Inst_MIMG__IMAGE_SAMPLE_L_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_B_O::Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B_O

    Inst_MIMG__IMAGE_SAMPLE_B_O::~Inst_MIMG__IMAGE_SAMPLE_B_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_B_O

    void
    Inst_MIMG__IMAGE_SAMPLE_B_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_B_CL_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b_cl_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B_CL_O

    Inst_MIMG__IMAGE_SAMPLE_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_B_CL_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL_O

    void
    Inst_MIMG__IMAGE_SAMPLE_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_LZ_O::Inst_MIMG__IMAGE_SAMPLE_LZ_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_lz_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_LZ_O

    Inst_MIMG__IMAGE_SAMPLE_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_LZ_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_LZ_O

    void
    Inst_MIMG__IMAGE_SAMPLE_LZ_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_O::Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_O

    Inst_MIMG__IMAGE_SAMPLE_C_O::~Inst_MIMG__IMAGE_SAMPLE_C_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_O

    void
    Inst_MIMG__IMAGE_SAMPLE_C_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CL_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_cl_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_CL_O

    Inst_MIMG__IMAGE_SAMPLE_C_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CL_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL_O

    void
    Inst_MIMG__IMAGE_SAMPLE_C_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_D_O::Inst_MIMG__IMAGE_SAMPLE_C_D_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D_O

    Inst_MIMG__IMAGE_SAMPLE_C_D_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_O

    void
    Inst_MIMG__IMAGE_SAMPLE_C_D_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d_cl_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O

    Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O

    void
    Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_L_O::Inst_MIMG__IMAGE_SAMPLE_C_L_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_l_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_L_O

    Inst_MIMG__IMAGE_SAMPLE_C_L_O::~Inst_MIMG__IMAGE_SAMPLE_C_L_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_L_O

    void
    Inst_MIMG__IMAGE_SAMPLE_C_L_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    }

    Inst_MIMG__IMAGE_SAMPLE_C_B_O::Inst_MIMG__IMAGE_SAMPLE_C_B_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_b_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_B_O

    Inst_MIMG__IMAGE_SAMPLE_C_B_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_O()
    {
    }
// ~Inst_MIMG__IMAGE_SAMPLE_C_B_O - - void - Inst_MIMG__IMAGE_SAMPLE_C_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O - - void - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::Inst_MIMG__IMAGE_SAMPLE_C_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O - - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O - - void - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4::Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4 - - Inst_MIMG__IMAGE_GATHER4::~Inst_MIMG__IMAGE_GATHER4() - { - } // ~Inst_MIMG__IMAGE_GATHER4 - - void - Inst_MIMG__IMAGE_GATHER4::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_CL::Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_CL - - Inst_MIMG__IMAGE_GATHER4_CL::~Inst_MIMG__IMAGE_GATHER4_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_CL - - void - Inst_MIMG__IMAGE_GATHER4_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_L::Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_L - - 
Inst_MIMG__IMAGE_GATHER4_L::~Inst_MIMG__IMAGE_GATHER4_L() - { - } // ~Inst_MIMG__IMAGE_GATHER4_L - - void - Inst_MIMG__IMAGE_GATHER4_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_B::Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B - - Inst_MIMG__IMAGE_GATHER4_B::~Inst_MIMG__IMAGE_GATHER4_B() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B - - void - Inst_MIMG__IMAGE_GATHER4_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_B_CL::Inst_MIMG__IMAGE_GATHER4_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B_CL - - Inst_MIMG__IMAGE_GATHER4_B_CL::~Inst_MIMG__IMAGE_GATHER4_B_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B_CL - - void - Inst_MIMG__IMAGE_GATHER4_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_LZ::Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_LZ - - Inst_MIMG__IMAGE_GATHER4_LZ::~Inst_MIMG__IMAGE_GATHER4_LZ() - { - } // ~Inst_MIMG__IMAGE_GATHER4_LZ - - void - Inst_MIMG__IMAGE_GATHER4_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C::Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C - - Inst_MIMG__IMAGE_GATHER4_C::~Inst_MIMG__IMAGE_GATHER4_C() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C - - void - Inst_MIMG__IMAGE_GATHER4_C::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_CL::Inst_MIMG__IMAGE_GATHER4_C_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_CL - - 
Inst_MIMG__IMAGE_GATHER4_C_CL::~Inst_MIMG__IMAGE_GATHER4_C_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_CL - - void - Inst_MIMG__IMAGE_GATHER4_C_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_L::Inst_MIMG__IMAGE_GATHER4_C_L( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_l") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_L - - Inst_MIMG__IMAGE_GATHER4_C_L::~Inst_MIMG__IMAGE_GATHER4_C_L() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_L - - void - Inst_MIMG__IMAGE_GATHER4_C_L::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_B::Inst_MIMG__IMAGE_GATHER4_C_B( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B - - Inst_MIMG__IMAGE_GATHER4_C_B::~Inst_MIMG__IMAGE_GATHER4_C_B() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B - - void - Inst_MIMG__IMAGE_GATHER4_C_B::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_B_CL::Inst_MIMG__IMAGE_GATHER4_C_B_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B_CL - - Inst_MIMG__IMAGE_GATHER4_C_B_CL::~Inst_MIMG__IMAGE_GATHER4_C_B_CL() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL - - void - Inst_MIMG__IMAGE_GATHER4_C_B_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_LZ::Inst_MIMG__IMAGE_GATHER4_C_LZ( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_lz") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_LZ - - Inst_MIMG__IMAGE_GATHER4_C_LZ::~Inst_MIMG__IMAGE_GATHER4_C_LZ() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ - - void - Inst_MIMG__IMAGE_GATHER4_C_LZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_O::Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_o") - { - 
setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_O - - Inst_MIMG__IMAGE_GATHER4_O::~Inst_MIMG__IMAGE_GATHER4_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_O - - void - Inst_MIMG__IMAGE_GATHER4_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_CL_O::Inst_MIMG__IMAGE_GATHER4_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_CL_O - - Inst_MIMG__IMAGE_GATHER4_CL_O::~Inst_MIMG__IMAGE_GATHER4_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_CL_O - - void - Inst_MIMG__IMAGE_GATHER4_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_L_O::Inst_MIMG__IMAGE_GATHER4_L_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_L_O - - Inst_MIMG__IMAGE_GATHER4_L_O::~Inst_MIMG__IMAGE_GATHER4_L_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_L_O - - void - Inst_MIMG__IMAGE_GATHER4_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_B_O::Inst_MIMG__IMAGE_GATHER4_B_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B_O - - Inst_MIMG__IMAGE_GATHER4_B_O::~Inst_MIMG__IMAGE_GATHER4_B_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B_O - - void - Inst_MIMG__IMAGE_GATHER4_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_B_CL_O::Inst_MIMG__IMAGE_GATHER4_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_B_CL_O - - Inst_MIMG__IMAGE_GATHER4_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_B_CL_O - - void - Inst_MIMG__IMAGE_GATHER4_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_LZ_O::Inst_MIMG__IMAGE_GATHER4_LZ_O( - InFmt_MIMG 
*iFmt) - : Inst_MIMG(iFmt, "image_gather4_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_LZ_O - - Inst_MIMG__IMAGE_GATHER4_LZ_O::~Inst_MIMG__IMAGE_GATHER4_LZ_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_LZ_O - - void - Inst_MIMG__IMAGE_GATHER4_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_O::Inst_MIMG__IMAGE_GATHER4_C_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_O - - Inst_MIMG__IMAGE_GATHER4_C_O::~Inst_MIMG__IMAGE_GATHER4_C_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_O - - void - Inst_MIMG__IMAGE_GATHER4_C_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_CL_O::Inst_MIMG__IMAGE_GATHER4_C_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_CL_O - - Inst_MIMG__IMAGE_GATHER4_C_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_CL_O - - void - Inst_MIMG__IMAGE_GATHER4_C_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_L_O::Inst_MIMG__IMAGE_GATHER4_C_L_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_L_O - - Inst_MIMG__IMAGE_GATHER4_C_L_O::~Inst_MIMG__IMAGE_GATHER4_C_L_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_L_O - - void - Inst_MIMG__IMAGE_GATHER4_C_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_B_O::Inst_MIMG__IMAGE_GATHER4_C_B_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B_O - - Inst_MIMG__IMAGE_GATHER4_C_B_O::~Inst_MIMG__IMAGE_GATHER4_C_B_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B_O - - void - Inst_MIMG__IMAGE_GATHER4_C_B_O::execute(GPUDynInstPtr gpuDynInst) - { - 
panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::Inst_MIMG__IMAGE_GATHER4_C_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O - - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O - - void - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GATHER4_C_LZ_O::Inst_MIMG__IMAGE_GATHER4_C_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_gather4_c_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GATHER4_C_LZ_O - - Inst_MIMG__IMAGE_GATHER4_C_LZ_O::~Inst_MIMG__IMAGE_GATHER4_C_LZ_O() - { - } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ_O - - void - Inst_MIMG__IMAGE_GATHER4_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_GET_LOD::Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_get_lod") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_GET_LOD - - Inst_MIMG__IMAGE_GET_LOD::~Inst_MIMG__IMAGE_GET_LOD() - { - } // ~Inst_MIMG__IMAGE_GET_LOD - - void - Inst_MIMG__IMAGE_GET_LOD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_CD::Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD - - Inst_MIMG__IMAGE_SAMPLE_CD::~Inst_MIMG__IMAGE_SAMPLE_CD() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD - - void - Inst_MIMG__IMAGE_SAMPLE_CD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_CD_CL::Inst_MIMG__IMAGE_SAMPLE_CD_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD_CL - - Inst_MIMG__IMAGE_SAMPLE_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_CD_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL - - void - 
Inst_MIMG__IMAGE_SAMPLE_CD_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_C_CD::Inst_MIMG__IMAGE_SAMPLE_C_CD( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD - - Inst_MIMG__IMAGE_SAMPLE_C_CD::~Inst_MIMG__IMAGE_SAMPLE_C_CD() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD - - void - Inst_MIMG__IMAGE_SAMPLE_C_CD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd_cl") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL - - void - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_CD_O::Inst_MIMG__IMAGE_SAMPLE_CD_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD_O - - Inst_MIMG__IMAGE_SAMPLE_CD_O::~Inst_MIMG__IMAGE_SAMPLE_CD_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD_O - - void - Inst_MIMG__IMAGE_SAMPLE_CD_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_CD_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_cd_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O - - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O - - void - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_C_CD_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD_O - - 
Inst_MIMG__IMAGE_SAMPLE_C_CD_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_O - - void - Inst_MIMG__IMAGE_SAMPLE_C_CD_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cd_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O - - void - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt) - : Inst_EXP(iFmt, "exp") - { - } // Inst_EXP__EXP - - Inst_EXP__EXP::~Inst_EXP__EXP() - { - } // ~Inst_EXP__EXP - - void - Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_LOAD_UBYTE::Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_ubyte") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_UBYTE - - Inst_FLAT__FLAT_LOAD_UBYTE::~Inst_FLAT__FLAT_LOAD_UBYTE() - { - } // ~Inst_FLAT__FLAT_LOAD_UBYTE - - // Untyped buffer load unsigned byte (zero extend to VGPR destination). 
- void - Inst_FLAT__FLAT_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->rdGmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } // execute - - void - Inst_FLAT__FLAT_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - vdst.write(); - } // execute - // --- Inst_FLAT__FLAT_LOAD_SBYTE class methods --- - - Inst_FLAT__FLAT_LOAD_SBYTE::Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_sbyte") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_SBYTE - - Inst_FLAT__FLAT_LOAD_SBYTE::~Inst_FLAT__FLAT_LOAD_SBYTE() - { - } // ~Inst_FLAT__FLAT_LOAD_SBYTE - - // Untyped buffer load signed byte (sign extend to VGPR destination). 
- void - Inst_FLAT__FLAT_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->rdGmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandI32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemI32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - vdst.write(); - } - - Inst_FLAT__FLAT_LOAD_USHORT::Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_ushort") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_USHORT - - Inst_FLAT__FLAT_LOAD_USHORT::~Inst_FLAT__FLAT_LOAD_USHORT() - { - } // ~Inst_FLAT__FLAT_LOAD_USHORT - - // Untyped buffer load unsigned short (zero extend to VGPR destination). 
- void - Inst_FLAT__FLAT_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->rdGmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - vdst.write(); - } - - - Inst_FLAT__FLAT_LOAD_SSHORT::Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_sshort") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_SSHORT - - Inst_FLAT__FLAT_LOAD_SSHORT::~Inst_FLAT__FLAT_LOAD_SSHORT() - { - } // ~Inst_FLAT__FLAT_LOAD_SSHORT - - // Untyped buffer load signed short (sign extend to VGPR destination). 
- void - Inst_FLAT__FLAT_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - void - Inst_FLAT__FLAT_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_FLAT__FLAT_LOAD_DWORD::Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORD - - Inst_FLAT__FLAT_LOAD_DWORD::~Inst_FLAT__FLAT_LOAD_DWORD() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORD - - // Untyped buffer load dword. - void - Inst_FLAT__FLAT_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->rdGmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - vdst.write(); - } // completeAcc - - 
Inst_FLAT__FLAT_LOAD_DWORDX2::Inst_FLAT__FLAT_LOAD_DWORDX2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX2 - - Inst_FLAT__FLAT_LOAD_DWORDX2::~Inst_FLAT__FLAT_LOAD_DWORDX2() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX2 - - // Untyped buffer load 2 dwords. - void - Inst_FLAT__FLAT_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->rdGmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - vdst.write(); - } // completeAcc - - Inst_FLAT__FLAT_LOAD_DWORDX3::Inst_FLAT__FLAT_LOAD_DWORDX3( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx3") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX3 - - Inst_FLAT__FLAT_LOAD_DWORDX3::~Inst_FLAT__FLAT_LOAD_DWORDX3() - { - } // 
~Inst_FLAT__FLAT_LOAD_DWORDX3 - - // Untyped buffer load 3 dwords. - void - Inst_FLAT__FLAT_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->rdGmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<3>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } // completeAcc - - Inst_FLAT__FLAT_LOAD_DWORDX4::Inst_FLAT__FLAT_LOAD_DWORDX4( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX4 - - Inst_FLAT__FLAT_LOAD_DWORDX4::~Inst_FLAT__FLAT_LOAD_DWORDX4() - { - } // 
~Inst_FLAT__FLAT_LOAD_DWORDX4 - - // Untyped buffer load 4 dwords. - void - Inst_FLAT__FLAT_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->rdGmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - - Inst_FLAT__FLAT_STORE_BYTE::Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_byte") - { - 
setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_BYTE - - Inst_FLAT__FLAT_STORE_BYTE::~Inst_FLAT__FLAT_STORE_BYTE() - { - } // ~Inst_FLAT__FLAT_STORE_BYTE - - // Untyped buffer store byte. - void - Inst_FLAT__FLAT_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->wrLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU8 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } // execute - - void - Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - - Inst_FLAT__FLAT_STORE_SHORT::Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_short") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_SHORT - - Inst_FLAT__FLAT_STORE_SHORT::~Inst_FLAT__FLAT_STORE_SHORT() - { - } // ~Inst_FLAT__FLAT_STORE_SHORT - - // Untyped buffer store short. 
- void - Inst_FLAT__FLAT_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->wrLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU16 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORD - - Inst_FLAT__FLAT_STORE_DWORD::~Inst_FLAT__FLAT_STORE_DWORD() - { - } // ~Inst_FLAT__FLAT_STORE_DWORD - - // Untyped buffer store dword. 
- void - Inst_FLAT__FLAT_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->wrLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_FLAT__FLAT_STORE_DWORDX2::Inst_FLAT__FLAT_STORE_DWORDX2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX2 - - Inst_FLAT__FLAT_STORE_DWORDX2::~Inst_FLAT__FLAT_STORE_DWORDX2() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX2 - - // Untyped buffer store 2 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->wrLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_FLAT__FLAT_STORE_DWORDX3::Inst_FLAT__FLAT_STORE_DWORDX3( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx3") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX3 - - Inst_FLAT__FLAT_STORE_DWORDX3::~Inst_FLAT__FLAT_STORE_DWORDX3() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX3 - - // Untyped buffer store 3 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->wrLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2] = data2[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<3>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_FLAT__FLAT_STORE_DWORDX4::Inst_FLAT__FLAT_STORE_DWORDX4( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX4 - - Inst_FLAT__FLAT_STORE_DWORDX4::~Inst_FLAT__FLAT_STORE_DWORDX4() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX4 - - // Untyped 
buffer store 4 dwords. - void - Inst_FLAT__FLAT_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->wrLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe - .issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_SWAP::Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_swap") - { - setFlag(AtomicExch); - if 
(instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } // if - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SWAP - - Inst_FLAT__FLAT_ATOMIC_SWAP::~Inst_FLAT__FLAT_ATOMIC_SWAP() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SWAP - - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL || - gpuDynInst->executedAs() == enums::SC_PRIVATE) { - // TODO: additional address computation required for scratch - panic_if(gpuDynInst->executedAs() == enums::SC_PRIVATE, - "Flats to private aperture not tested yet\n"); - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods --- - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP - ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } // if - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP() - { - } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP - - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - ConstVecOperandU32 cmp(gpuDynInst, extData.DATA + 1); - - addr.read(); - data.read(); - cmp.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->x_data))[lane] - = data[lane]; - (reinterpret_cast(gpuDynInst->a_data))[lane] - = cmp[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL || - gpuDynInst->executedAs() == enums::SC_PRIVATE) { - /** - * TODO: If you encounter this panic, just remove this panic - * and restart the simulation. It should just work fine but - * this is to warn user that this path is never tested although - * all the necessary logic is implemented - */ - panic_if(gpuDynInst->executedAs() == enums::SC_PRIVATE, - "Flats to private aperture not tested yet\n"); - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_ADD::Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } // if - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD - - Inst_FLAT__FLAT_ATOMIC_ADD::~Inst_FLAT__FLAT_ATOMIC_ADD() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD - - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. - issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_SUB::Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } // if - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SUB - - 
Inst_FLAT__FLAT_ATOMIC_SUB::~Inst_FLAT__FLAT_ATOMIC_SUB() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SUB - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - void - Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMIN - - Inst_FLAT__FLAT_ATOMIC_SMIN::~Inst_FLAT__FLAT_ATOMIC_SMIN() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMIN - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_UMIN::Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMIN - - Inst_FLAT__FLAT_ATOMIC_UMIN::~Inst_FLAT__FLAT_ATOMIC_UMIN() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMIN - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_SMAX::Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMAX - - Inst_FLAT__FLAT_ATOMIC_SMAX::~Inst_FLAT__FLAT_ATOMIC_SMAX() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMAX - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_UMAX::Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMAX - - Inst_FLAT__FLAT_ATOMIC_UMAX::~Inst_FLAT__FLAT_ATOMIC_UMAX() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMAX - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_AND::Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_AND - - Inst_FLAT__FLAT_ATOMIC_AND::~Inst_FLAT__FLAT_ATOMIC_AND() - { - } // ~Inst_FLAT__FLAT_ATOMIC_AND - - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_OR::Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_OR - - Inst_FLAT__FLAT_ATOMIC_OR::~Inst_FLAT__FLAT_ATOMIC_OR() - { - } // ~Inst_FLAT__FLAT_ATOMIC_OR - - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_XOR::Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_XOR - - Inst_FLAT__FLAT_ATOMIC_XOR::~Inst_FLAT__FLAT_ATOMIC_XOR() - { - } // ~Inst_FLAT__FLAT_ATOMIC_XOR - - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_INC::Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_INC - - Inst_FLAT__FLAT_ATOMIC_INC::~Inst_FLAT__FLAT_ATOMIC_INC() - { - } // ~Inst_FLAT__FLAT_ATOMIC_INC - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. - issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_DEC - - 
Inst_FLAT__FLAT_ATOMIC_DEC::~Inst_FLAT__FLAT_ATOMIC_DEC() - { - } // ~Inst_FLAT__FLAT_ATOMIC_DEC - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_swap_x2") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 - - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::~Inst_FLAT__FLAT_ATOMIC_SWAP_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_cmpswap_x2") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - // tmp = MEM[ADDR]; - // src = DATA[0:1]; - // cmp = DATA[2:3]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - ConstVecOperandU64 cmp(gpuDynInst, extData.DATA + 2); - - addr.read(); - data.read(); - cmp.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->x_data))[lane] - = data[lane]; - (reinterpret_cast(gpuDynInst->a_data))[lane] - = cmp[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL || - gpuDynInst->executedAs() == enums::SC_PRIVATE) { - /** - * TODO: If you encounter this panic, just remove this panic - * and restart the simulation. It should just work fine but - * this is to warn user that this path is never tested although - * all the necessary logic is implemented - */ - panic_if(gpuDynInst->executedAs() == enums::SC_PRIVATE, - "Flats to private aperture not tested yet\n"); - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_ADD_X2::Inst_FLAT__FLAT_ATOMIC_ADD_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add_x2") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD_X2 - - Inst_FLAT__FLAT_ATOMIC_ADD_X2::~Inst_FLAT__FLAT_ATOMIC_ADD_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. - issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_SUB_X2::Inst_FLAT__FLAT_ATOMIC_SUB_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_sub_x2") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SUB_X2 - - 
Inst_FLAT__FLAT_ATOMIC_SUB_X2::~Inst_FLAT__FLAT_ATOMIC_SUB_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SUB_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMIN_X2 - - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::~Inst_FLAT__FLAT_ATOMIC_SMIN_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::Inst_FLAT__FLAT_ATOMIC_UMIN_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMIN_X2 - - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::~Inst_FLAT__FLAT_ATOMIC_UMIN_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::Inst_FLAT__FLAT_ATOMIC_SMAX_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMAX_X2 - - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::~Inst_FLAT__FLAT_ATOMIC_SMAX_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::Inst_FLAT__FLAT_ATOMIC_UMAX_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMAX_X2 - - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::~Inst_FLAT__FLAT_ATOMIC_UMAX_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_AND_X2::Inst_FLAT__FLAT_ATOMIC_AND_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_and_x2") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_AND_X2 - - Inst_FLAT__FLAT_ATOMIC_AND_X2::~Inst_FLAT__FLAT_ATOMIC_AND_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_AND_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_OR_X2::Inst_FLAT__FLAT_ATOMIC_OR_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_or_x2") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_OR_X2 - - Inst_FLAT__FLAT_ATOMIC_OR_X2::~Inst_FLAT__FLAT_ATOMIC_OR_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_OR_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_XOR_X2::Inst_FLAT__FLAT_ATOMIC_XOR_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_xor_x2") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_XOR_X2 - - Inst_FLAT__FLAT_ATOMIC_XOR_X2::~Inst_FLAT__FLAT_ATOMIC_XOR_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_XOR_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } - - Inst_FLAT__FLAT_ATOMIC_INC_X2::Inst_FLAT__FLAT_ATOMIC_INC_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_inc_x2") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_INC_X2 - - Inst_FLAT__FLAT_ATOMIC_INC_X2::~Inst_FLAT__FLAT_ATOMIC_INC_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_INC_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. - issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_dec_x2") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_DEC_X2 - - 
Inst_FLAT__FLAT_ATOMIC_DEC_X2::~Inst_FLAT__FLAT_ATOMIC_DEC_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_DEC_X2 - - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - wf->wrGmReqsInPipe--; - wf->rdGmReqsInPipe--; - wf->wrLmReqsInPipe--; - wf->rdLmReqsInPipe--; - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - gpuDynInst->computeUnit()->localMemoryPipe - .issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } - - void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast<VecElemU64*>( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc -} // namespace Gcn3ISA -} // namespace gem5 diff --git a/src/arch/amdgpu/gcn3/insts/instructions.hh b/src/arch/amdgpu/gcn3/insts/instructions.hh deleted file mode 100644 index d1b7ccb60e..0000000000 --- a/src/arch/amdgpu/gcn3/insts/instructions.hh +++ /dev/null @@ -1,42723 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ARCH_GCN3_INSTS_INSTRUCTIONS_HH__ -#define __ARCH_GCN3_INSTS_INSTRUCTIONS_HH__ - -#include "arch/amdgpu/gcn3/gpu_decoder.hh" -#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh" -#include "arch/amdgpu/gcn3/insts/op_encodings.hh" -#include "debug/GCN3.hh" - -namespace gem5 -{ - -namespace Gcn3ISA -{ - class Inst_SOP2__S_ADD_U32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ADD_U32(InFmt_SOP2*); - ~Inst_SOP2__S_ADD_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_ADD_U32 - - class Inst_SOP2__S_SUB_U32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_SUB_U32(InFmt_SOP2*); - ~Inst_SOP2__S_SUB_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_SUB_U32 - - class Inst_SOP2__S_ADD_I32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ADD_I32(InFmt_SOP2*); - ~Inst_SOP2__S_ADD_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_ADD_I32 - - class Inst_SOP2__S_SUB_I32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_SUB_I32(InFmt_SOP2*); - ~Inst_SOP2__S_SUB_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_SUB_I32 - - class Inst_SOP2__S_ADDC_U32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ADDC_U32(InFmt_SOP2*); - ~Inst_SOP2__S_ADDC_U32(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_ADDC_U32 - - class Inst_SOP2__S_SUBB_U32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_SUBB_U32(InFmt_SOP2*); - ~Inst_SOP2__S_SUBB_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_SUBB_U32 - - class Inst_SOP2__S_MIN_I32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_MIN_I32(InFmt_SOP2*); - ~Inst_SOP2__S_MIN_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_MIN_I32 - - class Inst_SOP2__S_MIN_U32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_MIN_U32(InFmt_SOP2*); - ~Inst_SOP2__S_MIN_U32(); - - int - getNumOperands() override - { - 
return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_MIN_U32 - - class Inst_SOP2__S_MAX_I32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_MAX_I32(InFmt_SOP2*); - ~Inst_SOP2__S_MAX_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_MAX_I32 - - class Inst_SOP2__S_MAX_U32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_MAX_U32(InFmt_SOP2*); - ~Inst_SOP2__S_MAX_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_MAX_U32 - - class Inst_SOP2__S_CSELECT_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_CSELECT_B32(InFmt_SOP2*); - ~Inst_SOP2__S_CSELECT_B32(); - - int - getNumOperands() 
override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_CSELECT_B32 - - class Inst_SOP2__S_CSELECT_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_CSELECT_B64(InFmt_SOP2*); - ~Inst_SOP2__S_CSELECT_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_CSELECT_B64 - - class Inst_SOP2__S_AND_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_AND_B32(InFmt_SOP2*); - ~Inst_SOP2__S_AND_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_AND_B32 - - class Inst_SOP2__S_AND_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_AND_B64(InFmt_SOP2*); - ~Inst_SOP2__S_AND_B64(); - - 
int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_AND_B64 - - class Inst_SOP2__S_OR_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_OR_B32(InFmt_SOP2*); - ~Inst_SOP2__S_OR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_OR_B32 - - class Inst_SOP2__S_OR_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_OR_B64(InFmt_SOP2*); - ~Inst_SOP2__S_OR_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_OR_B64 - - class Inst_SOP2__S_XOR_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_XOR_B32(InFmt_SOP2*); - ~Inst_SOP2__S_XOR_B32(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_XOR_B32 - - class Inst_SOP2__S_XOR_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_XOR_B64(InFmt_SOP2*); - ~Inst_SOP2__S_XOR_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_XOR_B64 - - class Inst_SOP2__S_ANDN2_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ANDN2_B32(InFmt_SOP2*); - ~Inst_SOP2__S_ANDN2_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_ANDN2_B32 - - class Inst_SOP2__S_ANDN2_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ANDN2_B64(InFmt_SOP2*); - 
~Inst_SOP2__S_ANDN2_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_ANDN2_B64 - - class Inst_SOP2__S_ORN2_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ORN2_B32(InFmt_SOP2*); - ~Inst_SOP2__S_ORN2_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_ORN2_B32 - - class Inst_SOP2__S_ORN2_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ORN2_B64(InFmt_SOP2*); - ~Inst_SOP2__S_ORN2_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_ORN2_B64 - - class Inst_SOP2__S_NAND_B32 : public Inst_SOP2 - { - public: - 
Inst_SOP2__S_NAND_B32(InFmt_SOP2*); - ~Inst_SOP2__S_NAND_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_NAND_B32 - - class Inst_SOP2__S_NAND_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_NAND_B64(InFmt_SOP2*); - ~Inst_SOP2__S_NAND_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_NAND_B64 - - class Inst_SOP2__S_NOR_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_NOR_B32(InFmt_SOP2*); - ~Inst_SOP2__S_NOR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_NOR_B32 - - class Inst_SOP2__S_NOR_B64 : public Inst_SOP2 - { - 
public: - Inst_SOP2__S_NOR_B64(InFmt_SOP2*); - ~Inst_SOP2__S_NOR_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_NOR_B64 - - class Inst_SOP2__S_XNOR_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_XNOR_B32(InFmt_SOP2*); - ~Inst_SOP2__S_XNOR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_XNOR_B32 - - class Inst_SOP2__S_XNOR_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_XNOR_B64(InFmt_SOP2*); - ~Inst_SOP2__S_XNOR_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_XNOR_B64 - - class Inst_SOP2__S_LSHL_B32 : public 
Inst_SOP2 - { - public: - Inst_SOP2__S_LSHL_B32(InFmt_SOP2*); - ~Inst_SOP2__S_LSHL_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_LSHL_B32 - - class Inst_SOP2__S_LSHL_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_LSHL_B64(InFmt_SOP2*); - ~Inst_SOP2__S_LSHL_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_LSHL_B64 - - class Inst_SOP2__S_LSHR_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_LSHR_B32(InFmt_SOP2*); - ~Inst_SOP2__S_LSHR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_LSHR_B32 - - class 
Inst_SOP2__S_LSHR_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_LSHR_B64(InFmt_SOP2*); - ~Inst_SOP2__S_LSHR_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_LSHR_B64 - - class Inst_SOP2__S_ASHR_I32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ASHR_I32(InFmt_SOP2*); - ~Inst_SOP2__S_ASHR_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_ASHR_I32 - - class Inst_SOP2__S_ASHR_I64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ASHR_I64(InFmt_SOP2*); - ~Inst_SOP2__S_ASHR_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_SOP2__S_ASHR_I64 - - class Inst_SOP2__S_BFM_B32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_BFM_B32(InFmt_SOP2*); - ~Inst_SOP2__S_BFM_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_BFM_B32 - - class Inst_SOP2__S_BFM_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_BFM_B64(InFmt_SOP2*); - ~Inst_SOP2__S_BFM_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_BFM_B64 - - class Inst_SOP2__S_MUL_I32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_MUL_I32(InFmt_SOP2*); - ~Inst_SOP2__S_MUL_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; 
// Inst_SOP2__S_MUL_I32 - - class Inst_SOP2__S_BFE_U32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_BFE_U32(InFmt_SOP2*); - ~Inst_SOP2__S_BFE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_BFE_U32 - - class Inst_SOP2__S_BFE_I32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_BFE_I32(InFmt_SOP2*); - ~Inst_SOP2__S_BFE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_BFE_I32 - - class Inst_SOP2__S_BFE_U64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_BFE_U64(InFmt_SOP2*); - ~Inst_SOP2__S_BFE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - 
}; // Inst_SOP2__S_BFE_U64 - - class Inst_SOP2__S_BFE_I64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_BFE_I64(InFmt_SOP2*); - ~Inst_SOP2__S_BFE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_BFE_I64 - - class Inst_SOP2__S_CBRANCH_G_FORK : public Inst_SOP2 - { - public: - Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2*); - ~Inst_SOP2__S_CBRANCH_G_FORK(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_CBRANCH_G_FORK - - class Inst_SOP2__S_ABSDIFF_I32 : public Inst_SOP2 - { - public: - Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2*); - ~Inst_SOP2__S_ABSDIFF_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - 
- void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_ABSDIFF_I32 - - class Inst_SOP2__S_RFE_RESTORE_B64 : public Inst_SOP2 - { - public: - Inst_SOP2__S_RFE_RESTORE_B64(InFmt_SOP2*); - ~Inst_SOP2__S_RFE_RESTORE_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP2__S_RFE_RESTORE_B64 - - class Inst_SOPK__S_MOVK_I32 : public Inst_SOPK - { - public: - Inst_SOPK__S_MOVK_I32(InFmt_SOPK*); - ~Inst_SOPK__S_MOVK_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_MOVK_I32 - - class Inst_SOPK__S_CMOVK_I32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMOVK_I32(InFmt_SOPK*); - ~Inst_SOPK__S_CMOVK_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMOVK_I32 - - class Inst_SOPK__S_CMPK_EQ_I32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_EQ_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_EQ_I32 - - class Inst_SOPK__S_CMPK_LG_I32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_LG_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_LG_I32 - - class Inst_SOPK__S_CMPK_GT_I32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_GT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_SOPK__S_CMPK_GT_I32 - - class Inst_SOPK__S_CMPK_GE_I32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_GE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_GE_I32 - - class Inst_SOPK__S_CMPK_LT_I32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_LT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_LT_I32 - - class Inst_SOPK__S_CMPK_LE_I32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_LE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_LE_I32 - - class 
Inst_SOPK__S_CMPK_EQ_U32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_EQ_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_EQ_U32 - - class Inst_SOPK__S_CMPK_LG_U32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_LG_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_LG_U32 - - class Inst_SOPK__S_CMPK_GT_U32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_GT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_GT_U32 - - class Inst_SOPK__S_CMPK_GE_U32 : public Inst_SOPK 
- { - public: - Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_GE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_GE_U32 - - class Inst_SOPK__S_CMPK_LT_U32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_LT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_LT_U32 - - class Inst_SOPK__S_CMPK_LE_U32 : public Inst_SOPK - { - public: - Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK*); - ~Inst_SOPK__S_CMPK_LE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CMPK_LE_U32 - - class Inst_SOPK__S_ADDK_I32 : public Inst_SOPK - { - public: - 
Inst_SOPK__S_ADDK_I32(InFmt_SOPK*); - ~Inst_SOPK__S_ADDK_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_ADDK_I32 - - class Inst_SOPK__S_MULK_I32 : public Inst_SOPK - { - public: - Inst_SOPK__S_MULK_I32(InFmt_SOPK*); - ~Inst_SOPK__S_MULK_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_MULK_I32 - - class Inst_SOPK__S_CBRANCH_I_FORK : public Inst_SOPK - { - public: - Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK*); - ~Inst_SOPK__S_CBRANCH_I_FORK(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sdst - return 8; - case 1: // - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_CBRANCH_I_FORK - - class Inst_SOPK__S_GETREG_B32 : public Inst_SOPK - { - public: - Inst_SOPK__S_GETREG_B32(InFmt_SOPK*); - 
~Inst_SOPK__S_GETREG_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_GETREG_B32 - - class Inst_SOPK__S_SETREG_B32 : public Inst_SOPK - { - public: - Inst_SOPK__S_SETREG_B32(InFmt_SOPK*); - ~Inst_SOPK__S_SETREG_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_SETREG_B32 - - class Inst_SOPK__S_SETREG_IMM32_B32 : public Inst_SOPK - { - public: - Inst_SOPK__S_SETREG_IMM32_B32(InFmt_SOPK*); - ~Inst_SOPK__S_SETREG_IMM32_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm32 - return 4; - case 1: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPK__S_SETREG_IMM32_B32 - - class Inst_SOP1__S_MOV_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_MOV_B32(InFmt_SOP1*); - ~Inst_SOP1__S_MOV_B32(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_MOV_B32 - - class Inst_SOP1__S_MOV_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_MOV_B64(InFmt_SOP1*); - ~Inst_SOP1__S_MOV_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_MOV_B64 - - class Inst_SOP1__S_CMOV_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_CMOV_B32(InFmt_SOP1*); - ~Inst_SOP1__S_CMOV_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_CMOV_B32 - - class Inst_SOP1__S_CMOV_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_CMOV_B64(InFmt_SOP1*); - ~Inst_SOP1__S_CMOV_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // 
getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_CMOV_B64 - - class Inst_SOP1__S_NOT_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_NOT_B32(InFmt_SOP1*); - ~Inst_SOP1__S_NOT_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_NOT_B32 - - class Inst_SOP1__S_NOT_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_NOT_B64(InFmt_SOP1*); - ~Inst_SOP1__S_NOT_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_NOT_B64 - - class Inst_SOP1__S_WQM_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_WQM_B32(InFmt_SOP1*); - ~Inst_SOP1__S_WQM_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() 
override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_WQM_B32 - - class Inst_SOP1__S_WQM_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_WQM_B64(InFmt_SOP1*); - ~Inst_SOP1__S_WQM_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_WQM_B64 - - class Inst_SOP1__S_BREV_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BREV_B32(InFmt_SOP1*); - ~Inst_SOP1__S_BREV_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_BREV_B32 - - class Inst_SOP1__S_BREV_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BREV_B64(InFmt_SOP1*); - ~Inst_SOP1__S_BREV_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { 
- case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_BREV_B64 - - class Inst_SOP1__S_BCNT0_I32_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1*); - ~Inst_SOP1__S_BCNT0_I32_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_BCNT0_I32_B32 - - class Inst_SOP1__S_BCNT0_I32_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1*); - ~Inst_SOP1__S_BCNT0_I32_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_BCNT0_I32_B64 - - class Inst_SOP1__S_BCNT1_I32_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1*); - ~Inst_SOP1__S_BCNT1_I32_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - 
case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_BCNT1_I32_B32 - - class Inst_SOP1__S_BCNT1_I32_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1*); - ~Inst_SOP1__S_BCNT1_I32_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_BCNT1_I32_B64 - - class Inst_SOP1__S_FF0_I32_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1*); - ~Inst_SOP1__S_FF0_I32_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_FF0_I32_B32 - - class Inst_SOP1__S_FF0_I32_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1*); - ~Inst_SOP1__S_FF0_I32_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 4; - default: - 
fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_FF0_I32_B64 - - class Inst_SOP1__S_FF1_I32_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1*); - ~Inst_SOP1__S_FF1_I32_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_FF1_I32_B32 - - class Inst_SOP1__S_FF1_I32_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1*); - ~Inst_SOP1__S_FF1_I32_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_FF1_I32_B64 - - class Inst_SOP1__S_FLBIT_I32_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1*); - ~Inst_SOP1__S_FLBIT_I32_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); 
- return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_FLBIT_I32_B32 - - class Inst_SOP1__S_FLBIT_I32_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1*); - ~Inst_SOP1__S_FLBIT_I32_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_FLBIT_I32_B64 - - class Inst_SOP1__S_FLBIT_I32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_FLBIT_I32(InFmt_SOP1*); - ~Inst_SOP1__S_FLBIT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_FLBIT_I32 - - class Inst_SOP1__S_FLBIT_I32_I64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1*); - ~Inst_SOP1__S_FLBIT_I32_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - 
- void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_FLBIT_I32_I64 - - class Inst_SOP1__S_SEXT_I32_I8 : public Inst_SOP1 - { - public: - Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1*); - ~Inst_SOP1__S_SEXT_I32_I8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_SEXT_I32_I8 - - class Inst_SOP1__S_SEXT_I32_I16 : public Inst_SOP1 - { - public: - Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1*); - ~Inst_SOP1__S_SEXT_I32_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_SEXT_I32_I16 - - class Inst_SOP1__S_BITSET0_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BITSET0_B32(InFmt_SOP1*); - ~Inst_SOP1__S_BITSET0_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - 
}; // Inst_SOP1__S_BITSET0_B32 - - class Inst_SOP1__S_BITSET0_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BITSET0_B64(InFmt_SOP1*); - ~Inst_SOP1__S_BITSET0_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_BITSET0_B64 - - class Inst_SOP1__S_BITSET1_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BITSET1_B32(InFmt_SOP1*); - ~Inst_SOP1__S_BITSET1_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_BITSET1_B32 - - class Inst_SOP1__S_BITSET1_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_BITSET1_B64(InFmt_SOP1*); - ~Inst_SOP1__S_BITSET1_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_BITSET1_B64 - - class 
Inst_SOP1__S_GETPC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_GETPC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_GETPC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_GETPC_B64 - - class Inst_SOP1__S_SETPC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_SETPC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_SETPC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_SETPC_B64 - - class Inst_SOP1__S_SWAPPC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_SWAPPC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_SWAPPC_B64 - - class Inst_SOP1__S_RFE_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_RFE_B64(InFmt_SOP1*); - 
~Inst_SOP1__S_RFE_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_RFE_B64 - - class Inst_SOP1__S_AND_SAVEEXEC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_AND_SAVEEXEC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_AND_SAVEEXEC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_AND_SAVEEXEC_B64 - - class Inst_SOP1__S_OR_SAVEEXEC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_OR_SAVEEXEC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_OR_SAVEEXEC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_OR_SAVEEXEC_B64 - - class Inst_SOP1__S_XOR_SAVEEXEC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_XOR_SAVEEXEC_B64(InFmt_SOP1*); - 
~Inst_SOP1__S_XOR_SAVEEXEC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_XOR_SAVEEXEC_B64 - - class Inst_SOP1__S_ANDN2_SAVEEXEC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_ANDN2_SAVEEXEC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_ANDN2_SAVEEXEC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_ANDN2_SAVEEXEC_B64 - - class Inst_SOP1__S_ORN2_SAVEEXEC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_ORN2_SAVEEXEC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_ORN2_SAVEEXEC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_ORN2_SAVEEXEC_B64 - - class Inst_SOP1__S_NAND_SAVEEXEC_B64 : public Inst_SOP1 - { - public: - 
Inst_SOP1__S_NAND_SAVEEXEC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_NAND_SAVEEXEC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_NAND_SAVEEXEC_B64 - - class Inst_SOP1__S_NOR_SAVEEXEC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_NOR_SAVEEXEC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_NOR_SAVEEXEC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_NOR_SAVEEXEC_B64 - - class Inst_SOP1__S_XNOR_SAVEEXEC_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_XNOR_SAVEEXEC_B64(InFmt_SOP1*); - ~Inst_SOP1__S_XNOR_SAVEEXEC_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_XNOR_SAVEEXEC_B64 - - class Inst_SOP1__S_QUADMASK_B32 : public Inst_SOP1 - 
{ - public: - Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1*); - ~Inst_SOP1__S_QUADMASK_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_QUADMASK_B32 - - class Inst_SOP1__S_QUADMASK_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1*); - ~Inst_SOP1__S_QUADMASK_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_QUADMASK_B64 - - class Inst_SOP1__S_MOVRELS_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1*); - ~Inst_SOP1__S_MOVRELS_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_MOVRELS_B32 - - class Inst_SOP1__S_MOVRELS_B64 : public Inst_SOP1 - { - public: - 
Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1*); - ~Inst_SOP1__S_MOVRELS_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sdst - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_MOVRELS_B64 - - class Inst_SOP1__S_MOVRELD_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1*); - ~Inst_SOP1__S_MOVRELD_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_MOVRELD_B32 - - class Inst_SOP1__S_MOVRELD_B64 : public Inst_SOP1 - { - public: - Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1*); - ~Inst_SOP1__S_MOVRELD_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 8; - case 1: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_MOVRELD_B64 - - class Inst_SOP1__S_CBRANCH_JOIN : public Inst_SOP1 - { - public: - Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1*); - 
~Inst_SOP1__S_CBRANCH_JOIN(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_CBRANCH_JOIN - - class Inst_SOP1__S_ABS_I32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_ABS_I32(InFmt_SOP1*); - ~Inst_SOP1__S_ABS_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_ABS_I32 - - class Inst_SOP1__S_MOV_FED_B32 : public Inst_SOP1 - { - public: - Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1*); - ~Inst_SOP1__S_MOV_FED_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_MOV_FED_B32 - - class Inst_SOP1__S_SET_GPR_IDX_IDX : public Inst_SOP1 - { - public: - Inst_SOP1__S_SET_GPR_IDX_IDX(InFmt_SOP1*); - ~Inst_SOP1__S_SET_GPR_IDX_IDX(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOP1__S_SET_GPR_IDX_IDX - - class Inst_SOPC__S_CMP_EQ_I32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_EQ_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_EQ_I32 - - class Inst_SOPC__S_CMP_LG_I32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_LG_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_LG_I32 - - class Inst_SOPC__S_CMP_GT_I32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_GT_I32(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_GT_I32 - - class Inst_SOPC__S_CMP_GE_I32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_GE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_GE_I32 - - class Inst_SOPC__S_CMP_LT_I32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_LT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_LT_I32 - - class Inst_SOPC__S_CMP_LE_I32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_LE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // 
getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_LE_I32 - - class Inst_SOPC__S_CMP_EQ_U32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_EQ_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_EQ_U32 - - class Inst_SOPC__S_CMP_LG_U32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_LG_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_LG_U32 - - class Inst_SOPC__S_CMP_GT_U32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_GT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() 
override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_GT_U32 - - class Inst_SOPC__S_CMP_GE_U32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_GE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_GE_U32 - - class Inst_SOPC__S_CMP_LT_U32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_LT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_LT_U32 - - class Inst_SOPC__S_CMP_LE_U32 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_LE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() 
override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_LE_U32 - - class Inst_SOPC__S_BITCMP0_B32 : public Inst_SOPC - { - public: - Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC*); - ~Inst_SOPC__S_BITCMP0_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_BITCMP0_B32 - - class Inst_SOPC__S_BITCMP1_B32 : public Inst_SOPC - { - public: - Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC*); - ~Inst_SOPC__S_BITCMP1_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_BITCMP1_B32 - - class Inst_SOPC__S_BITCMP0_B64 : public Inst_SOPC - { - public: - Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC*); - ~Inst_SOPC__S_BITCMP0_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_BITCMP0_B64 - - class Inst_SOPC__S_BITCMP1_B64 : public Inst_SOPC - { - public: - Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC*); - ~Inst_SOPC__S_BITCMP1_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_BITCMP1_B64 - - class Inst_SOPC__S_SETVSKIP : public Inst_SOPC - { - public: - Inst_SOPC__S_SETVSKIP(InFmt_SOPC*); - ~Inst_SOPC__S_SETVSKIP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_SETVSKIP - - class Inst_SOPC__S_SET_GPR_IDX_ON : public Inst_SOPC - { - public: - Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC*); - ~Inst_SOPC__S_SET_GPR_IDX_ON(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - 
switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //simm4 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_SET_GPR_IDX_ON - - class Inst_SOPC__S_CMP_EQ_U64 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_EQ_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_EQ_U64 - - class Inst_SOPC__S_CMP_LG_U64 : public Inst_SOPC - { - public: - Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC*); - ~Inst_SOPC__S_CMP_LG_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 8; - case 1: //ssrc_1 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPC__S_CMP_LG_U64 - - class Inst_SOPP__S_NOP : public Inst_SOPP - { - public: - Inst_SOPP__S_NOP(InFmt_SOPP*); - ~Inst_SOPP__S_NOP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - 
fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_NOP - - class Inst_SOPP__S_ENDPGM : public Inst_SOPP - { - public: - Inst_SOPP__S_ENDPGM(InFmt_SOPP*); - ~Inst_SOPP__S_ENDPGM(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_ENDPGM - - class Inst_SOPP__S_BRANCH : public Inst_SOPP - { - public: - Inst_SOPP__S_BRANCH(InFmt_SOPP*); - ~Inst_SOPP__S_BRANCH(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_BRANCH - - class Inst_SOPP__S_WAKEUP : public Inst_SOPP - { - public: - Inst_SOPP__S_WAKEUP(InFmt_SOPP*); - ~Inst_SOPP__S_WAKEUP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_WAKEUP - - class Inst_SOPP__S_CBRANCH_SCC0 : 
public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP*); - ~Inst_SOPP__S_CBRANCH_SCC0(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_SCC0 - - class Inst_SOPP__S_CBRANCH_SCC1 : public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP*); - ~Inst_SOPP__S_CBRANCH_SCC1(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_SCC1 - - class Inst_SOPP__S_CBRANCH_VCCZ : public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP*); - ~Inst_SOPP__S_CBRANCH_VCCZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - case 1: - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_VCCZ - - class Inst_SOPP__S_CBRANCH_VCCNZ : public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP*); - 
~Inst_SOPP__S_CBRANCH_VCCNZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - case 1: - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_VCCNZ - - class Inst_SOPP__S_CBRANCH_EXECZ : public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP*); - ~Inst_SOPP__S_CBRANCH_EXECZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_EXECZ - - class Inst_SOPP__S_CBRANCH_EXECNZ : public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP*); - ~Inst_SOPP__S_CBRANCH_EXECNZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_EXECNZ - - class Inst_SOPP__S_BARRIER : public Inst_SOPP - { - public: - Inst_SOPP__S_BARRIER(InFmt_SOPP*); - ~Inst_SOPP__S_BARRIER(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_BARRIER - - class Inst_SOPP__S_SETKILL : public Inst_SOPP - { - public: - Inst_SOPP__S_SETKILL(InFmt_SOPP*); - ~Inst_SOPP__S_SETKILL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_SETKILL - - class Inst_SOPP__S_WAITCNT : public Inst_SOPP - { - public: - Inst_SOPP__S_WAITCNT(InFmt_SOPP*); - ~Inst_SOPP__S_WAITCNT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_WAITCNT - - class Inst_SOPP__S_SETHALT : public Inst_SOPP - { - public: - Inst_SOPP__S_SETHALT(InFmt_SOPP*); - ~Inst_SOPP__S_SETHALT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) 
override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_SETHALT - - class Inst_SOPP__S_SLEEP : public Inst_SOPP - { - public: - Inst_SOPP__S_SLEEP(InFmt_SOPP*); - ~Inst_SOPP__S_SLEEP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_SLEEP - - class Inst_SOPP__S_SETPRIO : public Inst_SOPP - { - public: - Inst_SOPP__S_SETPRIO(InFmt_SOPP*); - ~Inst_SOPP__S_SETPRIO(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_SETPRIO - - class Inst_SOPP__S_SENDMSG : public Inst_SOPP - { - public: - Inst_SOPP__S_SENDMSG(InFmt_SOPP*); - ~Inst_SOPP__S_SENDMSG(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_SENDMSG - - class Inst_SOPP__S_SENDMSGHALT : public Inst_SOPP - { - public: - Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP*); - ~Inst_SOPP__S_SENDMSGHALT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_SENDMSGHALT - - class Inst_SOPP__S_TRAP : public Inst_SOPP - { - public: - Inst_SOPP__S_TRAP(InFmt_SOPP*); - ~Inst_SOPP__S_TRAP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_TRAP - - class Inst_SOPP__S_ICACHE_INV : public Inst_SOPP - { - public: - Inst_SOPP__S_ICACHE_INV(InFmt_SOPP*); - ~Inst_SOPP__S_ICACHE_INV(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_ICACHE_INV - - class Inst_SOPP__S_INCPERFLEVEL : public Inst_SOPP - { - public: - 
Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP*); - ~Inst_SOPP__S_INCPERFLEVEL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_INCPERFLEVEL - - class Inst_SOPP__S_DECPERFLEVEL : public Inst_SOPP - { - public: - Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP*); - ~Inst_SOPP__S_DECPERFLEVEL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_DECPERFLEVEL - - class Inst_SOPP__S_TTRACEDATA : public Inst_SOPP - { - public: - Inst_SOPP__S_TTRACEDATA(InFmt_SOPP*); - ~Inst_SOPP__S_TTRACEDATA(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_TTRACEDATA - - class Inst_SOPP__S_CBRANCH_CDBGSYS : public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_CDBGSYS(InFmt_SOPP*); - ~Inst_SOPP__S_CBRANCH_CDBGSYS(); - - int - getNumOperands() override - { - 
return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_CDBGSYS - - class Inst_SOPP__S_CBRANCH_CDBGUSER : public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_CDBGUSER(InFmt_SOPP*); - ~Inst_SOPP__S_CBRANCH_CDBGUSER(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_CDBGUSER - - class Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER : public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER(InFmt_SOPP*); - ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER - - class Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER : public Inst_SOPP - { - public: - Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP*); - ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //label - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER - - class Inst_SOPP__S_ENDPGM_SAVED : public Inst_SOPP - { - public: - Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP*); - ~Inst_SOPP__S_ENDPGM_SAVED(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_ENDPGM_SAVED - - class Inst_SOPP__S_SET_GPR_IDX_OFF : public Inst_SOPP - { - public: - Inst_SOPP__S_SET_GPR_IDX_OFF(InFmt_SOPP*); - ~Inst_SOPP__S_SET_GPR_IDX_OFF(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_SET_GPR_IDX_OFF - - class Inst_SOPP__S_SET_GPR_IDX_MODE : public Inst_SOPP - { - public: - Inst_SOPP__S_SET_GPR_IDX_MODE(InFmt_SOPP*); - ~Inst_SOPP__S_SET_GPR_IDX_MODE(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //simm16 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SOPP__S_SET_GPR_IDX_MODE - - class Inst_SMEM__S_LOAD_DWORD : public Inst_SMEM - { - public: - Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM*); - ~Inst_SMEM__S_LOAD_DWORD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_base - return 8; - case 1: //offset - return 4; - case 2: //sgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_LOAD_DWORD - - class Inst_SMEM__S_LOAD_DWORDX2 : public Inst_SMEM - { - public: - Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM*); - ~Inst_SMEM__S_LOAD_DWORDX2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_base - return 8; - case 1: //offset - return 4; - case 2: //sgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_LOAD_DWORDX2 - - class Inst_SMEM__S_LOAD_DWORDX4 : public Inst_SMEM - { - public: - 
Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM*); - ~Inst_SMEM__S_LOAD_DWORDX4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_base - return 8; - case 1: //offset - return 4; - case 2: //sgpr_dst - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_LOAD_DWORDX4 - - class Inst_SMEM__S_LOAD_DWORDX8 : public Inst_SMEM - { - public: - Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM*); - ~Inst_SMEM__S_LOAD_DWORDX8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_base - return 8; - case 1: //offset - return 4; - case 2: //sgpr_dst - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_LOAD_DWORDX8 - - class Inst_SMEM__S_LOAD_DWORDX16 : public Inst_SMEM - { - public: - Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM*); - ~Inst_SMEM__S_LOAD_DWORDX16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_base - return 8; - case 1: //offset - return 4; - case 2: 
//sgpr_dst - return 64; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_LOAD_DWORDX16 - - class Inst_SMEM__S_BUFFER_LOAD_DWORD : public Inst_SMEM - { - public: - Inst_SMEM__S_BUFFER_LOAD_DWORD(InFmt_SMEM*); - ~Inst_SMEM__S_BUFFER_LOAD_DWORD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_base - return 16; - case 1: //offset - return 4; - case 2: //sgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_BUFFER_LOAD_DWORD - - class Inst_SMEM__S_BUFFER_LOAD_DWORDX2 : public Inst_SMEM - { - public: - Inst_SMEM__S_BUFFER_LOAD_DWORDX2(InFmt_SMEM*); - ~Inst_SMEM__S_BUFFER_LOAD_DWORDX2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_base - return 16; - case 1: //offset - return 4; - case 2: //sgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_BUFFER_LOAD_DWORDX2 - - class Inst_SMEM__S_BUFFER_LOAD_DWORDX4 : public Inst_SMEM - { - public: - 
Inst_SMEM__S_BUFFER_LOAD_DWORDX4(InFmt_SMEM*); - ~Inst_SMEM__S_BUFFER_LOAD_DWORDX4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_base - return 16; - case 1: //offset - return 4; - case 2: //sgpr_dst - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_BUFFER_LOAD_DWORDX4 - - class Inst_SMEM__S_BUFFER_LOAD_DWORDX8 : public Inst_SMEM - { - public: - Inst_SMEM__S_BUFFER_LOAD_DWORDX8(InFmt_SMEM*); - ~Inst_SMEM__S_BUFFER_LOAD_DWORDX8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_base - return 16; - case 1: //offset - return 4; - case 2: //sgpr_dst - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_BUFFER_LOAD_DWORDX8 - - class Inst_SMEM__S_BUFFER_LOAD_DWORDX16 : public Inst_SMEM - { - public: - Inst_SMEM__S_BUFFER_LOAD_DWORDX16(InFmt_SMEM*); - ~Inst_SMEM__S_BUFFER_LOAD_DWORDX16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - 
case 0: //sgpr_base - return 16; - case 1: //offset - return 4; - case 2: //sgpr_dst - return 64; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_BUFFER_LOAD_DWORDX16 - - class Inst_SMEM__S_STORE_DWORD : public Inst_SMEM - { - public: - Inst_SMEM__S_STORE_DWORD(InFmt_SMEM*); - ~Inst_SMEM__S_STORE_DWORD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_data - return 4; - case 1: //sgpr_base - return 8; - case 2: //offset - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_STORE_DWORD - - class Inst_SMEM__S_STORE_DWORDX2 : public Inst_SMEM - { - public: - Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM*); - ~Inst_SMEM__S_STORE_DWORDX2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_data - return 8; - case 1: //sgpr_base - return 8; - case 2: //offset - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_STORE_DWORDX2 - - class Inst_SMEM__S_STORE_DWORDX4 : public Inst_SMEM - { - 
public: - Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM*); - ~Inst_SMEM__S_STORE_DWORDX4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_data - return 16; - case 1: //sgpr_base - return 8; - case 2: //offset - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_STORE_DWORDX4 - - class Inst_SMEM__S_BUFFER_STORE_DWORD : public Inst_SMEM - { - public: - Inst_SMEM__S_BUFFER_STORE_DWORD(InFmt_SMEM*); - ~Inst_SMEM__S_BUFFER_STORE_DWORD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_data - return 4; - case 1: //sgpr_base - return 16; - case 2: //offset - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_BUFFER_STORE_DWORD - - class Inst_SMEM__S_BUFFER_STORE_DWORDX2 : public Inst_SMEM - { - public: - Inst_SMEM__S_BUFFER_STORE_DWORDX2(InFmt_SMEM*); - ~Inst_SMEM__S_BUFFER_STORE_DWORDX2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: 
//sgpr_data - return 8; - case 1: //sgpr_base - return 16; - case 2: //offset - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_BUFFER_STORE_DWORDX2 - - class Inst_SMEM__S_BUFFER_STORE_DWORDX4 : public Inst_SMEM - { - public: - Inst_SMEM__S_BUFFER_STORE_DWORDX4(InFmt_SMEM*); - ~Inst_SMEM__S_BUFFER_STORE_DWORDX4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_data - return 16; - case 1: //sgpr_base - return 16; - case 2: //offset - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_SMEM__S_BUFFER_STORE_DWORDX4 - - class Inst_SMEM__S_DCACHE_INV : public Inst_SMEM - { - public: - Inst_SMEM__S_DCACHE_INV(InFmt_SMEM*); - ~Inst_SMEM__S_DCACHE_INV(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SMEM__S_DCACHE_INV - - class Inst_SMEM__S_DCACHE_WB : public Inst_SMEM - { - public: - Inst_SMEM__S_DCACHE_WB(InFmt_SMEM*); - ~Inst_SMEM__S_DCACHE_WB(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // 
getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SMEM__S_DCACHE_WB - - class Inst_SMEM__S_DCACHE_INV_VOL : public Inst_SMEM - { - public: - Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM*); - ~Inst_SMEM__S_DCACHE_INV_VOL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SMEM__S_DCACHE_INV_VOL - - class Inst_SMEM__S_DCACHE_WB_VOL : public Inst_SMEM - { - public: - Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM*); - ~Inst_SMEM__S_DCACHE_WB_VOL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SMEM__S_DCACHE_WB_VOL - - class Inst_SMEM__S_MEMTIME : public Inst_SMEM - { - public: - Inst_SMEM__S_MEMTIME(InFmt_SMEM*); - ~Inst_SMEM__S_MEMTIME(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_dst - return 8; - 
default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SMEM__S_MEMTIME - - class Inst_SMEM__S_MEMREALTIME : public Inst_SMEM - { - public: - Inst_SMEM__S_MEMREALTIME(InFmt_SMEM*); - ~Inst_SMEM__S_MEMREALTIME(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SMEM__S_MEMREALTIME - - class Inst_SMEM__S_ATC_PROBE : public Inst_SMEM - { - public: - Inst_SMEM__S_ATC_PROBE(InFmt_SMEM*); - ~Inst_SMEM__S_ATC_PROBE(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //perm_rwx - return 32; - case 1: //sgpr_base - return 8; - case 2: //offset - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SMEM__S_ATC_PROBE - - class Inst_SMEM__S_ATC_PROBE_BUFFER : public Inst_SMEM - { - public: - Inst_SMEM__S_ATC_PROBE_BUFFER(InFmt_SMEM*); - ~Inst_SMEM__S_ATC_PROBE_BUFFER(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //perm_rwx - return 32; - case 1: //sgpr_base - return 16; - case 2: //offset - 
return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_SMEM__S_ATC_PROBE_BUFFER - - class Inst_VOP2__V_CNDMASK_B32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2*); - ~Inst_VOP2__V_CNDMASK_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_CNDMASK_B32 - - class Inst_VOP2__V_ADD_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_ADD_F32(InFmt_VOP2*); - ~Inst_VOP2__V_ADD_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_ADD_F32 - - class Inst_VOP2__V_SUB_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUB_F32(InFmt_VOP2*); - ~Inst_VOP2__V_SUB_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; 
- case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUB_F32 - - class Inst_VOP2__V_SUBREV_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUBREV_F32(InFmt_VOP2*); - ~Inst_VOP2__V_SUBREV_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUBREV_F32 - - class Inst_VOP2__V_MUL_LEGACY_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2*); - ~Inst_VOP2__V_MUL_LEGACY_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MUL_LEGACY_F32 - - class Inst_VOP2__V_MUL_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MUL_F32(InFmt_VOP2*); - ~Inst_VOP2__V_MUL_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch 
(opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MUL_F32 - - class Inst_VOP2__V_MUL_I32_I24 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2*); - ~Inst_VOP2__V_MUL_I32_I24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MUL_I32_I24 - - class Inst_VOP2__V_MUL_HI_I32_I24 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2*); - ~Inst_VOP2__V_MUL_HI_I32_I24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MUL_HI_I32_I24 - - class Inst_VOP2__V_MUL_U32_U24 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2*); - ~Inst_VOP2__V_MUL_U32_U24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - 
int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MUL_U32_U24 - - class Inst_VOP2__V_MUL_HI_U32_U24 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2*); - ~Inst_VOP2__V_MUL_HI_U32_U24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MUL_HI_U32_U24 - - class Inst_VOP2__V_MIN_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MIN_F32(InFmt_VOP2*); - ~Inst_VOP2__V_MIN_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MIN_F32 - - class Inst_VOP2__V_MAX_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MAX_F32(InFmt_VOP2*); - ~Inst_VOP2__V_MAX_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() 
override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MAX_F32 - - class Inst_VOP2__V_MIN_I32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MIN_I32(InFmt_VOP2*); - ~Inst_VOP2__V_MIN_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MIN_I32 - - class Inst_VOP2__V_MAX_I32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MAX_I32(InFmt_VOP2*); - ~Inst_VOP2__V_MAX_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MAX_I32 - - class Inst_VOP2__V_MIN_U32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MIN_U32(InFmt_VOP2*); - ~Inst_VOP2__V_MIN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() 
override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MIN_U32 - - class Inst_VOP2__V_MAX_U32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MAX_U32(InFmt_VOP2*); - ~Inst_VOP2__V_MAX_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MAX_U32 - - class Inst_VOP2__V_LSHRREV_B32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2*); - ~Inst_VOP2__V_LSHRREV_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_LSHRREV_B32 - - class Inst_VOP2__V_ASHRREV_I32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2*); - ~Inst_VOP2__V_ASHRREV_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - 
int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_ASHRREV_I32 - - class Inst_VOP2__V_LSHLREV_B32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2*); - ~Inst_VOP2__V_LSHLREV_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_LSHLREV_B32 - - class Inst_VOP2__V_AND_B32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_AND_B32(InFmt_VOP2*); - ~Inst_VOP2__V_AND_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_AND_B32 - - class Inst_VOP2__V_OR_B32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_OR_B32(InFmt_VOP2*); - ~Inst_VOP2__V_OR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { 
return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_OR_B32 - - class Inst_VOP2__V_XOR_B32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_XOR_B32(InFmt_VOP2*); - ~Inst_VOP2__V_XOR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_XOR_B32 - - class Inst_VOP2__V_MAC_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MAC_F32(InFmt_VOP2*); - ~Inst_VOP2__V_MAC_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MAC_F32 - - class Inst_VOP2__V_MADMK_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MADMK_F32(InFmt_VOP2*); - ~Inst_VOP2__V_MADMK_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { 
return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MADMK_F32 - - class Inst_VOP2__V_MADAK_F32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MADAK_F32(InFmt_VOP2*); - ~Inst_VOP2__V_MADAK_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MADAK_F32 - - class Inst_VOP2__V_ADD_U32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_ADD_U32(InFmt_VOP2*); - ~Inst_VOP2__V_ADD_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - case 3: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_ADD_U32 - - class Inst_VOP2__V_SUB_U32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUB_U32(InFmt_VOP2*); - ~Inst_VOP2__V_SUB_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - 
int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - case 3: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUB_U32 - - class Inst_VOP2__V_SUBREV_U32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUBREV_U32(InFmt_VOP2*); - ~Inst_VOP2__V_SUBREV_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - case 3: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUBREV_U32 - - class Inst_VOP2__V_ADDC_U32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_ADDC_U32(InFmt_VOP2*); - ~Inst_VOP2__V_ADDC_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - case 3: //vdst - return 4; - case 4: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_ADDC_U32 - - class Inst_VOP2__V_SUBB_U32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUBB_U32(InFmt_VOP2*); - 
~Inst_VOP2__V_SUBB_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - case 3: //vdst - return 4; - case 4: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUBB_U32 - - class Inst_VOP2__V_SUBBREV_U32 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUBBREV_U32(InFmt_VOP2*); - ~Inst_VOP2__V_SUBBREV_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - case 3: //vdst - return 4; - case 4: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUBBREV_U32 - - class Inst_VOP2__V_ADD_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_ADD_F16(InFmt_VOP2*); - ~Inst_VOP2__V_ADD_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; 
// Inst_VOP2__V_ADD_F16 - - class Inst_VOP2__V_SUB_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUB_F16(InFmt_VOP2*); - ~Inst_VOP2__V_SUB_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUB_F16 - - class Inst_VOP2__V_SUBREV_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUBREV_F16(InFmt_VOP2*); - ~Inst_VOP2__V_SUBREV_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUBREV_F16 - - class Inst_VOP2__V_MUL_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MUL_F16(InFmt_VOP2*); - ~Inst_VOP2__V_MUL_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) 
override; - }; // Inst_VOP2__V_MUL_F16 - - class Inst_VOP2__V_MAC_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MAC_F16(InFmt_VOP2*); - ~Inst_VOP2__V_MAC_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MAC_F16 - - class Inst_VOP2__V_MADMK_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MADMK_F16(InFmt_VOP2*); - ~Inst_VOP2__V_MADMK_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //src_2 - return 2; - case 3: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MADMK_F16 - - class Inst_VOP2__V_MADAK_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MADAK_F16(InFmt_VOP2*); - ~Inst_VOP2__V_MADAK_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //src_2 - return 2; - case 3: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - 
return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MADAK_F16 - - class Inst_VOP2__V_ADD_U16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_ADD_U16(InFmt_VOP2*); - ~Inst_VOP2__V_ADD_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_ADD_U16 - - class Inst_VOP2__V_SUB_U16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUB_U16(InFmt_VOP2*); - ~Inst_VOP2__V_SUB_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUB_U16 - - class Inst_VOP2__V_SUBREV_U16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_SUBREV_U16(InFmt_VOP2*); - ~Inst_VOP2__V_SUBREV_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", 
opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_SUBREV_U16 - - class Inst_VOP2__V_MUL_LO_U16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2*); - ~Inst_VOP2__V_MUL_LO_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MUL_LO_U16 - - class Inst_VOP2__V_LSHLREV_B16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2*); - ~Inst_VOP2__V_LSHLREV_B16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_LSHLREV_B16 - - class Inst_VOP2__V_LSHRREV_B16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2*); - ~Inst_VOP2__V_LSHRREV_B16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - 
default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_LSHRREV_B16 - - class Inst_VOP2__V_ASHRREV_I16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2*); - ~Inst_VOP2__V_ASHRREV_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_ASHRREV_I16 - - class Inst_VOP2__V_MAX_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MAX_F16(InFmt_VOP2*); - ~Inst_VOP2__V_MAX_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MAX_F16 - - class Inst_VOP2__V_MIN_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MIN_F16(InFmt_VOP2*); - ~Inst_VOP2__V_MIN_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: 
//vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MIN_F16 - - class Inst_VOP2__V_MAX_U16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MAX_U16(InFmt_VOP2*); - ~Inst_VOP2__V_MAX_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MAX_U16 - - class Inst_VOP2__V_MAX_I16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MAX_I16(InFmt_VOP2*); - ~Inst_VOP2__V_MAX_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MAX_I16 - - class Inst_VOP2__V_MIN_U16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MIN_U16(InFmt_VOP2*); - ~Inst_VOP2__V_MIN_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: 
//vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MIN_U16 - - class Inst_VOP2__V_MIN_I16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_MIN_I16(InFmt_VOP2*); - ~Inst_VOP2__V_MIN_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_MIN_I16 - - class Inst_VOP2__V_LDEXP_F16 : public Inst_VOP2 - { - public: - Inst_VOP2__V_LDEXP_F16(InFmt_VOP2*); - ~Inst_VOP2__V_LDEXP_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP2__V_LDEXP_F16 - - class Inst_VOP1__V_NOP : public Inst_VOP1 - { - public: - Inst_VOP1__V_NOP(InFmt_VOP1*); - ~Inst_VOP1__V_NOP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } 
// getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_NOP - - class Inst_VOP1__V_MOV_B32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_MOV_B32(InFmt_VOP1*); - ~Inst_VOP1__V_MOV_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_MOV_B32 - - class Inst_VOP1__V_READFIRSTLANE_B32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_READFIRSTLANE_B32(InFmt_VOP1*); - ~Inst_VOP1__V_READFIRSTLANE_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vsrc - return 4; - case 1: //sdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_READFIRSTLANE_B32 - - class Inst_VOP1__V_CVT_I32_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_I32_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) 
override; - }; // Inst_VOP1__V_CVT_I32_F64 - - class Inst_VOP1__V_CVT_F64_I32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F64_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F64_I32 - - class Inst_VOP1__V_CVT_F32_I32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F32_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F32_I32 - - class Inst_VOP1__V_CVT_F32_U32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F32_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F32_U32 - - class 
Inst_VOP1__V_CVT_U32_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_U32_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_U32_F32 - - class Inst_VOP1__V_CVT_I32_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_I32_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_I32_F32 - - class Inst_VOP1__V_MOV_FED_B32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1*); - ~Inst_VOP1__V_MOV_FED_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_MOV_FED_B32 - - class Inst_VOP1__V_CVT_F16_F32 : public Inst_VOP1 - { - 
public: - Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F16_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F16_F32 - - class Inst_VOP1__V_CVT_F32_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F32_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F32_F16 - - class Inst_VOP1__V_CVT_RPI_I32_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_RPI_I32_F32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_RPI_I32_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_RPI_I32_F32 - - class Inst_VOP1__V_CVT_FLR_I32_F32 : public Inst_VOP1 - { - public: - 
Inst_VOP1__V_CVT_FLR_I32_F32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_FLR_I32_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_FLR_I32_F32 - - class Inst_VOP1__V_CVT_OFF_F32_I4 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_OFF_F32_I4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_OFF_F32_I4 - - class Inst_VOP1__V_CVT_F32_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F32_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F32_F64 - - class Inst_VOP1__V_CVT_F64_F32 : public Inst_VOP1 - { - public: - 
Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F64_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F64_F32 - - class Inst_VOP1__V_CVT_F32_UBYTE0 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F32_UBYTE0(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F32_UBYTE0 - - class Inst_VOP1__V_CVT_F32_UBYTE1 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F32_UBYTE1(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F32_UBYTE1 - - class Inst_VOP1__V_CVT_F32_UBYTE2 : public Inst_VOP1 - { - public: - 
Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F32_UBYTE2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F32_UBYTE2 - - class Inst_VOP1__V_CVT_F32_UBYTE3 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F32_UBYTE3(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F32_UBYTE3 - - class Inst_VOP1__V_CVT_U32_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_U32_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_U32_F64 - - class Inst_VOP1__V_CVT_F64_U32 : public Inst_VOP1 - { - public: - 
Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F64_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F64_U32 - - class Inst_VOP1__V_TRUNC_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_TRUNC_F64(InFmt_VOP1*); - ~Inst_VOP1__V_TRUNC_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_TRUNC_F64 - - class Inst_VOP1__V_CEIL_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CEIL_F64(InFmt_VOP1*); - ~Inst_VOP1__V_CEIL_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CEIL_F64 - - class Inst_VOP1__V_RNDNE_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RNDNE_F64(InFmt_VOP1*); - ~Inst_VOP1__V_RNDNE_F64(); - - int 
- getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RNDNE_F64 - - class Inst_VOP1__V_FLOOR_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FLOOR_F64(InFmt_VOP1*); - ~Inst_VOP1__V_FLOOR_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FLOOR_F64 - - class Inst_VOP1__V_FRACT_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FRACT_F32(InFmt_VOP1*); - ~Inst_VOP1__V_FRACT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FRACT_F32 - - class Inst_VOP1__V_TRUNC_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_TRUNC_F32(InFmt_VOP1*); - ~Inst_VOP1__V_TRUNC_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_TRUNC_F32 - - class Inst_VOP1__V_CEIL_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CEIL_F32(InFmt_VOP1*); - ~Inst_VOP1__V_CEIL_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CEIL_F32 - - class Inst_VOP1__V_RNDNE_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RNDNE_F32(InFmt_VOP1*); - ~Inst_VOP1__V_RNDNE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RNDNE_F32 - - class Inst_VOP1__V_FLOOR_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FLOOR_F32(InFmt_VOP1*); - ~Inst_VOP1__V_FLOOR_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override 
{ return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FLOOR_F32 - - class Inst_VOP1__V_EXP_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_EXP_F32(InFmt_VOP1*); - ~Inst_VOP1__V_EXP_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_EXP_F32 - - class Inst_VOP1__V_LOG_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_LOG_F32(InFmt_VOP1*); - ~Inst_VOP1__V_LOG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_LOG_F32 - - class Inst_VOP1__V_RCP_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RCP_F32(InFmt_VOP1*); - ~Inst_VOP1__V_RCP_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) 
override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RCP_F32 - - class Inst_VOP1__V_RCP_IFLAG_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1*); - ~Inst_VOP1__V_RCP_IFLAG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RCP_IFLAG_F32 - - class Inst_VOP1__V_RSQ_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RSQ_F32(InFmt_VOP1*); - ~Inst_VOP1__V_RSQ_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RSQ_F32 - - class Inst_VOP1__V_RCP_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RCP_F64(InFmt_VOP1*); - ~Inst_VOP1__V_RCP_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst 
- return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RCP_F64 - - class Inst_VOP1__V_RSQ_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RSQ_F64(InFmt_VOP1*); - ~Inst_VOP1__V_RSQ_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RSQ_F64 - - class Inst_VOP1__V_SQRT_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_SQRT_F32(InFmt_VOP1*); - ~Inst_VOP1__V_SQRT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_SQRT_F32 - - class Inst_VOP1__V_SQRT_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_SQRT_F64(InFmt_VOP1*); - ~Inst_VOP1__V_SQRT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // 
getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_SQRT_F64 - - class Inst_VOP1__V_SIN_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_SIN_F32(InFmt_VOP1*); - ~Inst_VOP1__V_SIN_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_SIN_F32 - - class Inst_VOP1__V_COS_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_COS_F32(InFmt_VOP1*); - ~Inst_VOP1__V_COS_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_COS_F32 - - class Inst_VOP1__V_NOT_B32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_NOT_B32(InFmt_VOP1*); - ~Inst_VOP1__V_NOT_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_NOT_B32 - - class 
Inst_VOP1__V_BFREV_B32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_BFREV_B32(InFmt_VOP1*); - ~Inst_VOP1__V_BFREV_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_BFREV_B32 - - class Inst_VOP1__V_FFBH_U32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FFBH_U32(InFmt_VOP1*); - ~Inst_VOP1__V_FFBH_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FFBH_U32 - - class Inst_VOP1__V_FFBL_B32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FFBL_B32(InFmt_VOP1*); - ~Inst_VOP1__V_FFBL_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FFBL_B32 - - class Inst_VOP1__V_FFBH_I32 : public Inst_VOP1 - { - public: - 
Inst_VOP1__V_FFBH_I32(InFmt_VOP1*); - ~Inst_VOP1__V_FFBH_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FFBH_I32 - - class Inst_VOP1__V_FREXP_EXP_I32_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FREXP_EXP_I32_F64(InFmt_VOP1*); - ~Inst_VOP1__V_FREXP_EXP_I32_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FREXP_EXP_I32_F64 - - class Inst_VOP1__V_FREXP_MANT_F64 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1*); - ~Inst_VOP1__V_FREXP_MANT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FREXP_MANT_F64 - - class Inst_VOP1__V_FRACT_F64 : public Inst_VOP1 - { - public: - 
Inst_VOP1__V_FRACT_F64(InFmt_VOP1*); - ~Inst_VOP1__V_FRACT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FRACT_F64 - - class Inst_VOP1__V_FREXP_EXP_I32_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FREXP_EXP_I32_F32(InFmt_VOP1*); - ~Inst_VOP1__V_FREXP_EXP_I32_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FREXP_EXP_I32_F32 - - class Inst_VOP1__V_FREXP_MANT_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1*); - ~Inst_VOP1__V_FREXP_MANT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FREXP_MANT_F32 - - class Inst_VOP1__V_CLREXCP : public Inst_VOP1 - { - public: - 
Inst_VOP1__V_CLREXCP(InFmt_VOP1*); - ~Inst_VOP1__V_CLREXCP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CLREXCP - - class Inst_VOP1__V_CVT_F16_U16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F16_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F16_U16 - - class Inst_VOP1__V_CVT_F16_I16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_F16_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_F16_I16 - - class Inst_VOP1__V_CVT_U16_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_U16_F16(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_U16_F16 - - class Inst_VOP1__V_CVT_I16_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1*); - ~Inst_VOP1__V_CVT_I16_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CVT_I16_F16 - - class Inst_VOP1__V_RCP_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RCP_F16(InFmt_VOP1*); - ~Inst_VOP1__V_RCP_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RCP_F16 - - class Inst_VOP1__V_SQRT_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_SQRT_F16(InFmt_VOP1*); - ~Inst_VOP1__V_SQRT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_SQRT_F16 - - class Inst_VOP1__V_RSQ_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RSQ_F16(InFmt_VOP1*); - ~Inst_VOP1__V_RSQ_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RSQ_F16 - - class Inst_VOP1__V_LOG_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_LOG_F16(InFmt_VOP1*); - ~Inst_VOP1__V_LOG_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_LOG_F16 - - class Inst_VOP1__V_EXP_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_EXP_F16(InFmt_VOP1*); - ~Inst_VOP1__V_EXP_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int 
- getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_EXP_F16 - - class Inst_VOP1__V_FREXP_MANT_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1*); - ~Inst_VOP1__V_FREXP_MANT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FREXP_MANT_F16 - - class Inst_VOP1__V_FREXP_EXP_I16_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FREXP_EXP_I16_F16(InFmt_VOP1*); - ~Inst_VOP1__V_FREXP_EXP_I16_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FREXP_EXP_I16_F16 - - class Inst_VOP1__V_FLOOR_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FLOOR_F16(InFmt_VOP1*); - ~Inst_VOP1__V_FLOOR_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int 
opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FLOOR_F16 - - class Inst_VOP1__V_CEIL_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_CEIL_F16(InFmt_VOP1*); - ~Inst_VOP1__V_CEIL_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_CEIL_F16 - - class Inst_VOP1__V_TRUNC_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_TRUNC_F16(InFmt_VOP1*); - ~Inst_VOP1__V_TRUNC_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_TRUNC_F16 - - class Inst_VOP1__V_RNDNE_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_RNDNE_F16(InFmt_VOP1*); - ~Inst_VOP1__V_RNDNE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: 
//vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_RNDNE_F16 - - class Inst_VOP1__V_FRACT_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_FRACT_F16(InFmt_VOP1*); - ~Inst_VOP1__V_FRACT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_FRACT_F16 - - class Inst_VOP1__V_SIN_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_SIN_F16(InFmt_VOP1*); - ~Inst_VOP1__V_SIN_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_SIN_F16 - - class Inst_VOP1__V_COS_F16 : public Inst_VOP1 - { - public: - Inst_VOP1__V_COS_F16(InFmt_VOP1*); - ~Inst_VOP1__V_COS_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - 
} - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_COS_F16 - - class Inst_VOP1__V_EXP_LEGACY_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1*); - ~Inst_VOP1__V_EXP_LEGACY_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_EXP_LEGACY_F32 - - class Inst_VOP1__V_LOG_LEGACY_F32 : public Inst_VOP1 - { - public: - Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1*); - ~Inst_VOP1__V_LOG_LEGACY_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP1__V_LOG_LEGACY_F32 - - class Inst_VOPC__V_CMP_CLASS_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_CLASS_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return 
-1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_CLASS_F32 - - class Inst_VOPC__V_CMPX_CLASS_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_CLASS_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_CLASS_F32 - - class Inst_VOPC__V_CMP_CLASS_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_CLASS_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_CLASS_F64 - - class Inst_VOPC__V_CMPX_CLASS_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_CLASS_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 4; - case 2: //vcc - 
return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_CLASS_F64 - - class Inst_VOPC__V_CMP_CLASS_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_CLASS_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_CLASS_F16 - - class Inst_VOPC__V_CMPX_CLASS_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_CLASS_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_CLASS_F16 - - class Inst_VOPC__V_CMP_F_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_F_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_F_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - 
return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_F_F16 - - class Inst_VOPC__V_CMP_LT_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LT_F16 - - class Inst_VOPC__V_CMP_EQ_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_EQ_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_EQ_F16 - - class Inst_VOPC__V_CMP_LE_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch 
(opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LE_F16 - - class Inst_VOPC__V_CMP_GT_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GT_F16 - - class Inst_VOPC__V_CMP_LG_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LG_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LG_F16 - - class Inst_VOPC__V_CMP_GE_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int 
opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GE_F16 - - class Inst_VOPC__V_CMP_O_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_O_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_O_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_O_F16 - - class Inst_VOPC__V_CMP_U_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_U_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_U_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_U_F16 - - class Inst_VOPC__V_CMP_NGE_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NGE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - 
int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NGE_F16 - - class Inst_VOPC__V_CMP_NLG_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NLG_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NLG_F16 - - class Inst_VOPC__V_CMP_NGT_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NGT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NGT_F16 - - class Inst_VOPC__V_CMP_NLE_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NLE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int 
numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NLE_F16 - - class Inst_VOPC__V_CMP_NEQ_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NEQ_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NEQ_F16 - - class Inst_VOPC__V_CMP_NLT_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NLT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NLT_F16 - - class Inst_VOPC__V_CMP_TRU_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_TRU_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_TRU_F16 - - class Inst_VOPC__V_CMPX_F_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_F_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_F_F16 - - class Inst_VOPC__V_CMPX_LT_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LT_F16 - - class Inst_VOPC__V_CMPX_EQ_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_EQ_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_EQ_F16 - - class Inst_VOPC__V_CMPX_LE_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LE_F16 - - class Inst_VOPC__V_CMPX_GT_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GT_F16 - - class Inst_VOPC__V_CMPX_LG_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LG_F16(); - - int - getNumOperands() 
override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LG_F16 - - class Inst_VOPC__V_CMPX_GE_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GE_F16 - - class Inst_VOPC__V_CMPX_O_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_O_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_O_F16 - - class Inst_VOPC__V_CMPX_U_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC*); - 
~Inst_VOPC__V_CMPX_U_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_U_F16 - - class Inst_VOPC__V_CMPX_NGE_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NGE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NGE_F16 - - class Inst_VOPC__V_CMPX_NLG_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NLG_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NLG_F16 - - class Inst_VOPC__V_CMPX_NGT_F16 : public Inst_VOPC - { - 
public: - Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NGT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NGT_F16 - - class Inst_VOPC__V_CMPX_NLE_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NLE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NLE_F16 - - class Inst_VOPC__V_CMPX_NEQ_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NEQ_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NEQ_F16 - - 
class Inst_VOPC__V_CMPX_NLT_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NLT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NLT_F16 - - class Inst_VOPC__V_CMPX_TRU_F16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_TRU_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_TRU_F16 - - class Inst_VOPC__V_CMP_F_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_F_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_F_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) 
override; - }; // Inst_VOPC__V_CMP_F_F32 - - class Inst_VOPC__V_CMP_LT_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LT_F32 - - class Inst_VOPC__V_CMP_EQ_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_EQ_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_EQ_F32 - - class Inst_VOPC__V_CMP_LE_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - 
void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LE_F32 - - class Inst_VOPC__V_CMP_GT_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GT_F32 - - class Inst_VOPC__V_CMP_LG_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LG_F32 - - class Inst_VOPC__V_CMP_GE_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - 
} - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GE_F32 - - class Inst_VOPC__V_CMP_O_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_O_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_O_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_O_F32 - - class Inst_VOPC__V_CMP_U_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_U_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_U_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_U_F32 - - class Inst_VOPC__V_CMP_NGE_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NGE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", 
opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NGE_F32 - - class Inst_VOPC__V_CMP_NLG_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NLG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NLG_F32 - - class Inst_VOPC__V_CMP_NGT_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NGT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NGT_F32 - - class Inst_VOPC__V_CMP_NLE_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NLE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - 
default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NLE_F32 - - class Inst_VOPC__V_CMP_NEQ_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NEQ_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NEQ_F32 - - class Inst_VOPC__V_CMP_NLT_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NLT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NLT_F32 - - class Inst_VOPC__V_CMP_TRU_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_TRU_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - 
return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_TRU_F32 - - class Inst_VOPC__V_CMPX_F_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_F_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_F_F32 - - class Inst_VOPC__V_CMPX_LT_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LT_F32 - - class Inst_VOPC__V_CMPX_EQ_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_EQ_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: 
//src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_EQ_F32 - - class Inst_VOPC__V_CMPX_LE_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LE_F32 - - class Inst_VOPC__V_CMPX_GT_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GT_F32 - - class Inst_VOPC__V_CMPX_LG_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) 
override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LG_F32 - - class Inst_VOPC__V_CMPX_GE_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GE_F32 - - class Inst_VOPC__V_CMPX_O_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_O_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_O_F32 - - class Inst_VOPC__V_CMPX_U_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_U_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - 
- int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_U_F32 - - class Inst_VOPC__V_CMPX_NGE_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NGE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NGE_F32 - - class Inst_VOPC__V_CMPX_NLG_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NLG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NLG_F32 - - class Inst_VOPC__V_CMPX_NGT_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NGT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } 
- int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NGT_F32 - - class Inst_VOPC__V_CMPX_NLE_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NLE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NLE_F32 - - class Inst_VOPC__V_CMPX_NEQ_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NEQ_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NEQ_F32 - - class Inst_VOPC__V_CMPX_NLT_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NLT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // 
getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NLT_F32 - - class Inst_VOPC__V_CMPX_TRU_F32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_TRU_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_TRU_F32 - - class Inst_VOPC__V_CMP_F_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_F_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_F_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_F_F64 - - class Inst_VOPC__V_CMP_LT_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LT_F64(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LT_F64 - - class Inst_VOPC__V_CMP_EQ_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_EQ_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_EQ_F64 - - class Inst_VOPC__V_CMP_LE_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LE_F64 - - class Inst_VOPC__V_CMP_GT_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GT_F64(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GT_F64 - - class Inst_VOPC__V_CMP_LG_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LG_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LG_F64 - - class Inst_VOPC__V_CMP_GE_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GE_F64 - - class Inst_VOPC__V_CMP_O_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_O_F64(InFmt_VOPC*); - 
~Inst_VOPC__V_CMP_O_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_O_F64 - - class Inst_VOPC__V_CMP_U_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_U_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_U_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_U_F64 - - class Inst_VOPC__V_CMP_NGE_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NGE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NGE_F64 - - class Inst_VOPC__V_CMP_NLG_F64 : public Inst_VOPC - { - public: - 
Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NLG_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NLG_F64 - - class Inst_VOPC__V_CMP_NGT_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NGT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NGT_F64 - - class Inst_VOPC__V_CMP_NLE_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NLE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NLE_F64 - - class 
Inst_VOPC__V_CMP_NEQ_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NEQ_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NEQ_F64 - - class Inst_VOPC__V_CMP_NLT_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NLT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NLT_F64 - - class Inst_VOPC__V_CMP_TRU_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_TRU_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - 
}; // Inst_VOPC__V_CMP_TRU_F64 - - class Inst_VOPC__V_CMPX_F_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_F_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_F_F64 - - class Inst_VOPC__V_CMPX_LT_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LT_F64 - - class Inst_VOPC__V_CMPX_EQ_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_EQ_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - 
void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_EQ_F64 - - class Inst_VOPC__V_CMPX_LE_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LE_F64 - - class Inst_VOPC__V_CMPX_GT_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GT_F64 - - class Inst_VOPC__V_CMPX_LG_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LG_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - 
return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LG_F64 - - class Inst_VOPC__V_CMPX_GE_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GE_F64 - - class Inst_VOPC__V_CMPX_O_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_O_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_O_F64 - - class Inst_VOPC__V_CMPX_U_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_U_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op 
idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_U_F64 - - class Inst_VOPC__V_CMPX_NGE_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NGE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NGE_F64 - - class Inst_VOPC__V_CMPX_NLG_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NLG_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NLG_F64 - - class Inst_VOPC__V_CMPX_NGT_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NGT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; 
- case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NGT_F64 - - class Inst_VOPC__V_CMPX_NLE_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NLE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NLE_F64 - - class Inst_VOPC__V_CMPX_NEQ_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NEQ_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NEQ_F64 - - class Inst_VOPC__V_CMPX_NLT_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NLT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 
0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NLT_F64 - - class Inst_VOPC__V_CMPX_TRU_F64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_TRU_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_TRU_F64 - - class Inst_VOPC__V_CMP_F_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_F_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_F_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_F_I16 - - class Inst_VOPC__V_CMP_LT_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LT_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) 
override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LT_I16 - - class Inst_VOPC__V_CMP_EQ_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_EQ_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_EQ_I16 - - class Inst_VOPC__V_CMP_LE_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LE_I16 - - class Inst_VOPC__V_CMP_GT_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GT_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int 
- getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GT_I16 - - class Inst_VOPC__V_CMP_NE_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NE_I16 - - class Inst_VOPC__V_CMP_GE_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GE_I16 - - class Inst_VOPC__V_CMP_T_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_T_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_T_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() 
override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_T_I16 - - class Inst_VOPC__V_CMP_F_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_F_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_F_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_F_U16 - - class Inst_VOPC__V_CMP_LT_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LT_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LT_U16 - - class Inst_VOPC__V_CMP_EQ_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_EQ_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - 
int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_EQ_U16 - - class Inst_VOPC__V_CMP_LE_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LE_U16 - - class Inst_VOPC__V_CMP_GT_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GT_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GT_U16 - - class Inst_VOPC__V_CMP_NE_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NE_U16 - - class Inst_VOPC__V_CMP_GE_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GE_U16 - - class Inst_VOPC__V_CMP_T_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_T_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_T_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_T_U16 - - class Inst_VOPC__V_CMPX_F_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_F_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } 
// getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_F_I16 - - class Inst_VOPC__V_CMPX_LT_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LT_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LT_I16 - - class Inst_VOPC__V_CMPX_EQ_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_EQ_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_EQ_I16 - - class Inst_VOPC__V_CMPX_LE_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LE_I16(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LE_I16 - - class Inst_VOPC__V_CMPX_GT_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GT_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GT_I16 - - class Inst_VOPC__V_CMPX_NE_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NE_I16 - - class Inst_VOPC__V_CMPX_GE_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GE_I16(); - - int 
- getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GE_I16 - - class Inst_VOPC__V_CMPX_T_I16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_T_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_T_I16 - - class Inst_VOPC__V_CMPX_F_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_F_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_F_U16 - - class Inst_VOPC__V_CMPX_LT_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC*); - 
~Inst_VOPC__V_CMPX_LT_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LT_U16 - - class Inst_VOPC__V_CMPX_EQ_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_EQ_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_EQ_U16 - - class Inst_VOPC__V_CMPX_LE_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LE_U16 - - class Inst_VOPC__V_CMPX_GT_U16 : public Inst_VOPC - { - public: - 
Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GT_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GT_U16 - - class Inst_VOPC__V_CMPX_NE_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NE_U16 - - class Inst_VOPC__V_CMPX_GE_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GE_U16 - - class 
Inst_VOPC__V_CMPX_T_U16 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_T_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_T_U16 - - class Inst_VOPC__V_CMP_F_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_F_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_F_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_F_I32 - - class Inst_VOPC__V_CMP_LT_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_VOPC__V_CMP_LT_I32 - - class Inst_VOPC__V_CMP_EQ_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_EQ_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_EQ_I32 - - class Inst_VOPC__V_CMP_LE_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LE_I32 - - class Inst_VOPC__V_CMP_GT_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GT_I32 - - class Inst_VOPC__V_CMP_NE_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NE_I32 - - class Inst_VOPC__V_CMP_GE_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GE_I32 - - class Inst_VOPC__V_CMP_T_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_T_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_T_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } 
// getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_T_I32 - - class Inst_VOPC__V_CMP_F_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_F_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_F_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_F_U32 - - class Inst_VOPC__V_CMP_LT_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LT_U32 - - class Inst_VOPC__V_CMP_EQ_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_EQ_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", 
opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_EQ_U32 - - class Inst_VOPC__V_CMP_LE_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LE_U32 - - class Inst_VOPC__V_CMP_GT_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GT_U32 - - class Inst_VOPC__V_CMP_NE_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - 
fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NE_U32 - - class Inst_VOPC__V_CMP_GE_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GE_U32 - - class Inst_VOPC__V_CMP_T_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_T_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_T_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_T_U32 - - class Inst_VOPC__V_CMPX_F_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_F_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc 
- return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_F_I32 - - class Inst_VOPC__V_CMPX_LT_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LT_I32 - - class Inst_VOPC__V_CMPX_EQ_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_EQ_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_EQ_I32 - - class Inst_VOPC__V_CMPX_LE_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 
1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LE_I32 - - class Inst_VOPC__V_CMPX_GT_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GT_I32 - - class Inst_VOPC__V_CMPX_NE_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NE_I32 - - class Inst_VOPC__V_CMPX_GE_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch 
(opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GE_I32 - - class Inst_VOPC__V_CMPX_T_I32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_T_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_T_I32 - - class Inst_VOPC__V_CMPX_F_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_F_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_F_U32 - - class Inst_VOPC__V_CMPX_LT_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LT_U32 - - class Inst_VOPC__V_CMPX_EQ_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_EQ_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_EQ_U32 - - class Inst_VOPC__V_CMPX_LE_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LE_U32 - - class Inst_VOPC__V_CMPX_GT_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int 
numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GT_U32 - - class Inst_VOPC__V_CMPX_NE_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NE_U32 - - class Inst_VOPC__V_CMPX_GE_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GE_U32 - - class Inst_VOPC__V_CMPX_T_U32 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_T_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_T_U32 - - class Inst_VOPC__V_CMP_F_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_F_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_F_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_F_I64 - - class Inst_VOPC__V_CMP_LT_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LT_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LT_I64 - - class Inst_VOPC__V_CMP_EQ_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_EQ_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } 
// getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_EQ_I64 - - class Inst_VOPC__V_CMP_LE_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LE_I64 - - class Inst_VOPC__V_CMP_GT_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GT_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GT_I64 - - class Inst_VOPC__V_CMP_NE_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NE_I64(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NE_I64 - - class Inst_VOPC__V_CMP_GE_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GE_I64 - - class Inst_VOPC__V_CMP_T_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_T_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_T_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_T_I64 - - class Inst_VOPC__V_CMP_F_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_F_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_F_U64(); - - int - getNumOperands() 
override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_F_U64 - - class Inst_VOPC__V_CMP_LT_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LT_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LT_U64 - - class Inst_VOPC__V_CMP_EQ_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_EQ_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_EQ_U64 - - class Inst_VOPC__V_CMP_LE_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_LE_U64(); 
- - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_LE_U64 - - class Inst_VOPC__V_CMP_GT_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_GT_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GT_U64 - - class Inst_VOPC__V_CMP_NE_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_NE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_NE_U64 - - class Inst_VOPC__V_CMP_GE_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC*); - 
~Inst_VOPC__V_CMP_GE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_GE_U64 - - class Inst_VOPC__V_CMP_T_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMP_T_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMP_T_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMP_T_U64 - - class Inst_VOPC__V_CMPX_F_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_F_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_F_I64 - - class Inst_VOPC__V_CMPX_LT_I64 : public Inst_VOPC - { - public: - 
Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LT_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LT_I64 - - class Inst_VOPC__V_CMPX_EQ_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_EQ_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_EQ_I64 - - class Inst_VOPC__V_CMPX_LE_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LE_I64 - - class 
Inst_VOPC__V_CMPX_GT_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GT_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GT_I64 - - class Inst_VOPC__V_CMPX_NE_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NE_I64 - - class Inst_VOPC__V_CMPX_GE_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - 
}; // Inst_VOPC__V_CMPX_GE_I64 - - class Inst_VOPC__V_CMPX_T_I64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_T_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_T_I64 - - class Inst_VOPC__V_CMPX_F_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_F_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_F_U64 - - class Inst_VOPC__V_CMPX_LT_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LT_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LT_U64 - - class Inst_VOPC__V_CMPX_EQ_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_EQ_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_EQ_U64 - - class Inst_VOPC__V_CMPX_LE_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_LE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_LE_U64 - - class Inst_VOPC__V_CMPX_GT_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GT_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - 
return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GT_U64 - - class Inst_VOPC__V_CMPX_NE_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_NE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_NE_U64 - - class Inst_VOPC__V_CMPX_GE_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_GE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_GE_U64 - - class Inst_VOPC__V_CMPX_T_U64 : public Inst_VOPC - { - public: - Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC*); - ~Inst_VOPC__V_CMPX_T_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vcc - return 8; - default: - 
fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOPC__V_CMPX_T_U64 - - class Inst_VINTRP__V_INTERP_P1_F32 : public Inst_VINTRP - { - public: - Inst_VINTRP__V_INTERP_P1_F32(InFmt_VINTRP*); - ~Inst_VINTRP__V_INTERP_P1_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_ij - return 4; - case 1: //attr - return 16; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VINTRP__V_INTERP_P1_F32 - - class Inst_VINTRP__V_INTERP_P2_F32 : public Inst_VINTRP - { - public: - Inst_VINTRP__V_INTERP_P2_F32(InFmt_VINTRP*); - ~Inst_VINTRP__V_INTERP_P2_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_ij - return 4; - case 1: //attr - return 16; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VINTRP__V_INTERP_P2_F32 - - class Inst_VINTRP__V_INTERP_MOV_F32 : public Inst_VINTRP - { - public: - Inst_VINTRP__V_INTERP_MOV_F32(InFmt_VINTRP*); - ~Inst_VINTRP__V_INTERP_MOV_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - 
switch (opIdx) { - case 0: //param - return 4; - case 1: //attr - return 16; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VINTRP__V_INTERP_MOV_F32 - - class Inst_VOP3__V_CMP_CLASS_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_CLASS_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_CLASS_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_CLASS_F32 - - class Inst_VOP3__V_CMPX_CLASS_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_CLASS_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_CLASS_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_CLASS_F32 - - class Inst_VOP3__V_CMP_CLASS_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_CLASS_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_CLASS_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int 
numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_CLASS_F64 - - class Inst_VOP3__V_CMPX_CLASS_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_CLASS_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_CLASS_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_CLASS_F64 - - class Inst_VOP3__V_CMP_CLASS_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_CLASS_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_CLASS_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_CLASS_F16 - - class Inst_VOP3__V_CMPX_CLASS_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_CLASS_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_CLASS_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } 
// getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_CLASS_F16 - - class Inst_VOP3__V_CMP_F_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_F_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_F_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_F_F16 - - class Inst_VOP3__V_CMP_LT_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LT_F16 - - class Inst_VOP3__V_CMP_EQ_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_EQ_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_EQ_F16(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_EQ_F16 - - class Inst_VOP3__V_CMP_LE_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LE_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LE_F16 - - class Inst_VOP3__V_CMP_GT_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GT_F16 - - class Inst_VOP3__V_CMP_LG_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LG_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LG_F16(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LG_F16 - - class Inst_VOP3__V_CMP_GE_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GE_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GE_F16 - - class Inst_VOP3__V_CMP_O_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_O_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_O_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_O_F16 - - class Inst_VOP3__V_CMP_U_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_U_F16(InFmt_VOP3*); - 
~Inst_VOP3__V_CMP_U_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_U_F16 - - class Inst_VOP3__V_CMP_NGE_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NGE_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NGE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NGE_F16 - - class Inst_VOP3__V_CMP_NLG_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NLG_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NLG_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NLG_F16 - - class Inst_VOP3__V_CMP_NGT_F16 : public Inst_VOP3 - { - public: - 
Inst_VOP3__V_CMP_NGT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NGT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NGT_F16 - - class Inst_VOP3__V_CMP_NLE_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NLE_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NLE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NLE_F16 - - class Inst_VOP3__V_CMP_NEQ_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NEQ_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NEQ_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NEQ_F16 - - class 
Inst_VOP3__V_CMP_NLT_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NLT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NLT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NLT_F16 - - class Inst_VOP3__V_CMP_TRU_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_TRU_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_TRU_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_TRU_F16 - - class Inst_VOP3__V_CMPX_F_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_F_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_F_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - 
}; // Inst_VOP3__V_CMPX_F_F16 - - class Inst_VOP3__V_CMPX_LT_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LT_F16 - - class Inst_VOP3__V_CMPX_EQ_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_EQ_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_EQ_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_EQ_F16 - - class Inst_VOP3__V_CMPX_LE_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LE_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - 
- void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LE_F16 - - class Inst_VOP3__V_CMPX_GT_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GT_F16 - - class Inst_VOP3__V_CMPX_LG_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LG_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LG_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LG_F16 - - class Inst_VOP3__V_CMPX_GE_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GE_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", 
opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GE_F16 - - class Inst_VOP3__V_CMPX_O_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_O_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_O_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_O_F16 - - class Inst_VOP3__V_CMPX_U_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_U_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_U_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_U_F16 - - class Inst_VOP3__V_CMPX_NGE_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NGE_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NGE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - 
default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NGE_F16 - - class Inst_VOP3__V_CMPX_NLG_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NLG_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NLG_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NLG_F16 - - class Inst_VOP3__V_CMPX_NGT_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NGT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NGT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NGT_F16 - - class Inst_VOP3__V_CMPX_NLE_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NLE_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NLE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - 
case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NLE_F16 - - class Inst_VOP3__V_CMPX_NEQ_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NEQ_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NEQ_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NEQ_F16 - - class Inst_VOP3__V_CMPX_NLT_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NLT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NLT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NLT_F16 - - class Inst_VOP3__V_CMPX_TRU_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_TRU_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_TRU_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override 
- { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_TRU_F16 - - class Inst_VOP3__V_CMP_F_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_F_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_F_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_F_F32 - - class Inst_VOP3__V_CMP_LT_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LT_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LT_F32 - - class Inst_VOP3__V_CMP_EQ_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_EQ_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_EQ_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_EQ_F32 - - class Inst_VOP3__V_CMP_LE_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LE_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LE_F32 - - class Inst_VOP3__V_CMP_GT_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GT_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GT_F32 - - class Inst_VOP3__V_CMP_LG_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LG_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() 
override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LG_F32 - - class Inst_VOP3__V_CMP_GE_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GE_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GE_F32 - - class Inst_VOP3__V_CMP_O_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_O_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_O_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_O_F32 - - class Inst_VOP3__V_CMP_U_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_U_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_U_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - 
int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_U_F32 - - class Inst_VOP3__V_CMP_NGE_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NGE_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NGE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NGE_F32 - - class Inst_VOP3__V_CMP_NLG_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NLG_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NLG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NLG_F32 - - class Inst_VOP3__V_CMP_NGT_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NGT_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NGT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NGT_F32 - - class Inst_VOP3__V_CMP_NLE_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NLE_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NLE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NLE_F32 - - class Inst_VOP3__V_CMP_NEQ_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NEQ_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NEQ_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NEQ_F32 - - class Inst_VOP3__V_CMP_NLT_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NLT_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NLT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NLT_F32 - - class Inst_VOP3__V_CMP_TRU_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_TRU_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_TRU_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_TRU_F32 - - class Inst_VOP3__V_CMPX_F_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_F_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_F_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_F_F32 - - class Inst_VOP3__V_CMPX_LT_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LT_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LT_F32(); - - int - getNumOperands() 
override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LT_F32 - - class Inst_VOP3__V_CMPX_EQ_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_EQ_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_EQ_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_EQ_F32 - - class Inst_VOP3__V_CMPX_LE_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LE_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LE_F32 - - class Inst_VOP3__V_CMPX_GT_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GT_F32(InFmt_VOP3*); - 
~Inst_VOP3__V_CMPX_GT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GT_F32 - - class Inst_VOP3__V_CMPX_LG_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LG_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LG_F32 - - class Inst_VOP3__V_CMPX_GE_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GE_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GE_F32 - - class Inst_VOP3__V_CMPX_O_F32 : public Inst_VOP3 - { - public: 
- Inst_VOP3__V_CMPX_O_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_O_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_O_F32 - - class Inst_VOP3__V_CMPX_U_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_U_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_U_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_U_F32 - - class Inst_VOP3__V_CMPX_NGE_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NGE_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NGE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NGE_F32 - - class 
Inst_VOP3__V_CMPX_NLG_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NLG_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NLG_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NLG_F32 - - class Inst_VOP3__V_CMPX_NGT_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NGT_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NGT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NGT_F32 - - class Inst_VOP3__V_CMPX_NLE_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NLE_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NLE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NLE_F32 - - class Inst_VOP3__V_CMPX_NEQ_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NEQ_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NEQ_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NEQ_F32 - - class Inst_VOP3__V_CMPX_NLT_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NLT_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NLT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NLT_F32 - - class Inst_VOP3__V_CMPX_TRU_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_TRU_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_TRU_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", 
opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_TRU_F32 - - class Inst_VOP3__V_CMP_F_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_F_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_F_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_F_F64 - - class Inst_VOP3__V_CMP_LT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LT_F64 - - class Inst_VOP3__V_CMP_EQ_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_EQ_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_EQ_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - 
fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_EQ_F64 - - class Inst_VOP3__V_CMP_LE_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LE_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LE_F64 - - class Inst_VOP3__V_CMP_GT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GT_F64 - - class Inst_VOP3__V_CMP_LG_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LG_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LG_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: 
//sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LG_F64 - - class Inst_VOP3__V_CMP_GE_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GE_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GE_F64 - - class Inst_VOP3__V_CMP_O_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_O_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_O_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_O_F64 - - class Inst_VOP3__V_CMP_U_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_U_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_U_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: 
//src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_U_F64 - - class Inst_VOP3__V_CMP_NGE_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NGE_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NGE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NGE_F64 - - class Inst_VOP3__V_CMP_NLG_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NLG_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NLG_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NLG_F64 - - class Inst_VOP3__V_CMP_NGT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NGT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NGT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) 
{ - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NGT_F64 - - class Inst_VOP3__V_CMP_NLE_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NLE_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NLE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NLE_F64 - - class Inst_VOP3__V_CMP_NEQ_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NEQ_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NEQ_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NEQ_F64 - - class Inst_VOP3__V_CMP_NLT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NLT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NLT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NLT_F64 - - class Inst_VOP3__V_CMP_TRU_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_TRU_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_TRU_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_TRU_F64 - - class Inst_VOP3__V_CMPX_F_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_F_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_F_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_F_F64 - - class Inst_VOP3__V_CMPX_LT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int 
numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LT_F64 - - class Inst_VOP3__V_CMPX_EQ_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_EQ_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_EQ_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_EQ_F64 - - class Inst_VOP3__V_CMPX_LE_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LE_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LE_F64 - - class Inst_VOP3__V_CMPX_GT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GT_F64 - - class Inst_VOP3__V_CMPX_LG_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LG_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LG_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LG_F64 - - class Inst_VOP3__V_CMPX_GE_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GE_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GE_F64 - - class Inst_VOP3__V_CMPX_O_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_O_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_O_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_O_F64 - - class Inst_VOP3__V_CMPX_U_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_U_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_U_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_U_F64 - - class Inst_VOP3__V_CMPX_NGE_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NGE_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NGE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NGE_F64 - - class Inst_VOP3__V_CMPX_NLG_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NLG_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NLG_F64(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NLG_F64 - - class Inst_VOP3__V_CMPX_NGT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NGT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NGT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NGT_F64 - - class Inst_VOP3__V_CMPX_NLE_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NLE_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NLE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NLE_F64 - - class Inst_VOP3__V_CMPX_NEQ_F64 : public Inst_VOP3 - { - public: - 
Inst_VOP3__V_CMPX_NEQ_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NEQ_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NEQ_F64 - - class Inst_VOP3__V_CMPX_NLT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NLT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NLT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NLT_F64 - - class Inst_VOP3__V_CMPX_TRU_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_TRU_F64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_TRU_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_TRU_F64 - - class 
Inst_VOP3__V_CMP_F_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_F_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_F_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_F_I16 - - class Inst_VOP3__V_CMP_LT_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LT_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LT_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LT_I16 - - class Inst_VOP3__V_CMP_EQ_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_EQ_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_EQ_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_VOP3__V_CMP_EQ_I16 - - class Inst_VOP3__V_CMP_LE_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LE_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LE_I16 - - class Inst_VOP3__V_CMP_GT_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GT_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GT_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GT_I16 - - class Inst_VOP3__V_CMP_NE_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NE_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NE_I16 - - class Inst_VOP3__V_CMP_GE_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GE_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GE_I16 - - class Inst_VOP3__V_CMP_T_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_T_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_T_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_T_I16 - - class Inst_VOP3__V_CMP_F_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_F_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_F_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // 
getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_F_U16 - - class Inst_VOP3__V_CMP_LT_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LT_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LT_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LT_U16 - - class Inst_VOP3__V_CMP_EQ_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_EQ_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_EQ_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_EQ_U16 - - class Inst_VOP3__V_CMP_LE_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LE_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", 
opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LE_U16 - - class Inst_VOP3__V_CMP_GT_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GT_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GT_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GT_U16 - - class Inst_VOP3__V_CMP_NE_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NE_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NE_U16 - - class Inst_VOP3__V_CMP_GE_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GE_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - 
fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GE_U16 - - class Inst_VOP3__V_CMP_T_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_T_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_T_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_T_U16 - - class Inst_VOP3__V_CMPX_F_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_F_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_F_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_F_I16 - - class Inst_VOP3__V_CMPX_LT_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LT_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LT_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: 
//sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LT_I16 - - class Inst_VOP3__V_CMPX_EQ_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_EQ_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_EQ_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_EQ_I16 - - class Inst_VOP3__V_CMPX_LE_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LE_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LE_I16 - - class Inst_VOP3__V_CMPX_GT_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GT_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GT_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 
2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GT_I16 - - class Inst_VOP3__V_CMPX_NE_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NE_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NE_I16 - - class Inst_VOP3__V_CMPX_GE_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GE_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GE_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GE_I16 - - class Inst_VOP3__V_CMPX_T_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_T_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_T_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - 
switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_T_I16 - - class Inst_VOP3__V_CMPX_F_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_F_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_F_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_F_U16 - - class Inst_VOP3__V_CMPX_LT_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LT_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LT_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LT_U16 - - class Inst_VOP3__V_CMPX_EQ_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_EQ_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_EQ_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_EQ_U16 - - class Inst_VOP3__V_CMPX_LE_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LE_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LE_U16 - - class Inst_VOP3__V_CMPX_GT_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GT_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GT_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GT_U16 - - class Inst_VOP3__V_CMPX_NE_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NE_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int 
numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NE_U16 - - class Inst_VOP3__V_CMPX_GE_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GE_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GE_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GE_U16 - - class Inst_VOP3__V_CMPX_T_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_T_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_T_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_T_U16 - - class Inst_VOP3__V_CMP_F_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_F_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_F_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_F_I32 - - class Inst_VOP3__V_CMP_LT_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LT_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LT_I32 - - class Inst_VOP3__V_CMP_EQ_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_EQ_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_EQ_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_EQ_I32 - - class Inst_VOP3__V_CMP_LE_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LE_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); 
- } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LE_I32 - - class Inst_VOP3__V_CMP_GT_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GT_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GT_I32 - - class Inst_VOP3__V_CMP_NE_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NE_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NE_I32 - - class Inst_VOP3__V_CMP_GE_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GE_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GE_I32(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GE_I32 - - class Inst_VOP3__V_CMP_T_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_T_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_T_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_T_I32 - - class Inst_VOP3__V_CMP_F_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_F_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_F_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_F_U32 - - class Inst_VOP3__V_CMP_LT_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LT_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LT_U32(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LT_U32 - - class Inst_VOP3__V_CMP_EQ_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_EQ_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_EQ_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_EQ_U32 - - class Inst_VOP3__V_CMP_LE_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LE_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LE_U32 - - class Inst_VOP3__V_CMP_GT_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GT_U32(InFmt_VOP3*); - 
~Inst_VOP3__V_CMP_GT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GT_U32 - - class Inst_VOP3__V_CMP_NE_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NE_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NE_U32 - - class Inst_VOP3__V_CMP_GE_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GE_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GE_U32 - - class Inst_VOP3__V_CMP_T_U32 : public Inst_VOP3 - { - public: - 
Inst_VOP3__V_CMP_T_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_T_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_T_U32 - - class Inst_VOP3__V_CMPX_F_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_F_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_F_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_F_I32 - - class Inst_VOP3__V_CMPX_LT_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LT_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LT_I32 - - class Inst_VOP3__V_CMPX_EQ_I32 : 
public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_EQ_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_EQ_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_EQ_I32 - - class Inst_VOP3__V_CMPX_LE_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LE_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LE_I32 - - class Inst_VOP3__V_CMPX_GT_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GT_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GT_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_VOP3__V_CMPX_GT_I32 - - class Inst_VOP3__V_CMPX_NE_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NE_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NE_I32 - - class Inst_VOP3__V_CMPX_GE_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GE_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GE_I32 - - class Inst_VOP3__V_CMPX_T_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_T_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_T_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_T_I32 - - class Inst_VOP3__V_CMPX_F_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_F_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_F_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_F_U32 - - class Inst_VOP3__V_CMPX_LT_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LT_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LT_U32 - - class Inst_VOP3__V_CMPX_EQ_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_EQ_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_EQ_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return 
-1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_EQ_U32 - - class Inst_VOP3__V_CMPX_LE_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LE_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LE_U32 - - class Inst_VOP3__V_CMPX_GT_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GT_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GT_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GT_U32 - - class Inst_VOP3__V_CMPX_NE_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NE_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - 
fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NE_U32 - - class Inst_VOP3__V_CMPX_GE_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GE_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GE_U32 - - class Inst_VOP3__V_CMPX_T_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_T_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_T_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_T_U32 - - class Inst_VOP3__V_CMP_F_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_F_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_F_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 
2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_F_I64 - - class Inst_VOP3__V_CMP_LT_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LT_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LT_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LT_I64 - - class Inst_VOP3__V_CMP_EQ_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_EQ_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_EQ_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_EQ_I64 - - class Inst_VOP3__V_CMP_LE_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LE_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 
1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LE_I64 - - class Inst_VOP3__V_CMP_GT_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GT_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GT_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GT_I64 - - class Inst_VOP3__V_CMP_NE_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NE_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NE_I64 - - class Inst_VOP3__V_CMP_GE_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GE_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - 
case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GE_I64 - - class Inst_VOP3__V_CMP_T_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_T_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_T_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_T_I64 - - class Inst_VOP3__V_CMP_F_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_F_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_F_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_F_U64 - - class Inst_VOP3__V_CMP_LT_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LT_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LT_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - 
{ - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LT_U64 - - class Inst_VOP3__V_CMP_EQ_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_EQ_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_EQ_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_EQ_U64 - - class Inst_VOP3__V_CMP_LE_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_LE_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_LE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_LE_U64 - - class Inst_VOP3__V_CMP_GT_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GT_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GT_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GT_U64 - - class Inst_VOP3__V_CMP_NE_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_NE_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_NE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_NE_U64 - - class Inst_VOP3__V_CMP_GE_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_GE_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_GE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_GE_U64 - - class Inst_VOP3__V_CMP_T_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMP_T_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMP_T_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() 
override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMP_T_U64 - - class Inst_VOP3__V_CMPX_F_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_F_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_F_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_F_I64 - - class Inst_VOP3__V_CMPX_LT_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LT_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LT_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LT_I64 - - class Inst_VOP3__V_CMPX_EQ_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_EQ_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_EQ_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { 
return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_EQ_I64 - - class Inst_VOP3__V_CMPX_LE_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LE_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LE_I64 - - class Inst_VOP3__V_CMPX_GT_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GT_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GT_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GT_I64 - - class Inst_VOP3__V_CMPX_NE_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NE_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // 
getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NE_I64 - - class Inst_VOP3__V_CMPX_GE_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GE_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GE_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GE_I64 - - class Inst_VOP3__V_CMPX_T_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_T_I64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_T_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_T_I64 - - class Inst_VOP3__V_CMPX_F_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_F_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_F_U64(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_F_U64 - - class Inst_VOP3__V_CMPX_LT_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LT_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LT_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LT_U64 - - class Inst_VOP3__V_CMPX_EQ_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_EQ_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_EQ_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_EQ_U64 - - class Inst_VOP3__V_CMPX_LE_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_LE_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_LE_U64(); - - 
int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_LE_U64 - - class Inst_VOP3__V_CMPX_GT_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_GT_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GT_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GT_U64 - - class Inst_VOP3__V_CMPX_NE_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_NE_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_NE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_NE_U64 - - class Inst_VOP3__V_CMPX_GE_U64 : public Inst_VOP3 - { - public: - 
Inst_VOP3__V_CMPX_GE_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_GE_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_GE_U64 - - class Inst_VOP3__V_CMPX_T_U64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CMPX_T_U64(InFmt_VOP3*); - ~Inst_VOP3__V_CMPX_T_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //sdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CMPX_T_U64 - - class Inst_VOP3__V_CNDMASK_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3*); - ~Inst_VOP3__V_CNDMASK_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //carryin - return 8; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_VOP3__V_CNDMASK_B32 - - class Inst_VOP3__V_ADD_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_ADD_F32(InFmt_VOP3*); - ~Inst_VOP3__V_ADD_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ADD_F32 - - class Inst_VOP3__V_SUB_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SUB_F32(InFmt_VOP3*); - ~Inst_VOP3__V_SUB_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SUB_F32 - - class Inst_VOP3__V_SUBREV_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SUBREV_F32(InFmt_VOP3*); - ~Inst_VOP3__V_SUBREV_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; 
- }; // Inst_VOP3__V_SUBREV_F32 - - class Inst_VOP3__V_MUL_LEGACY_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_LEGACY_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_LEGACY_F32 - - class Inst_VOP3__V_MUL_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_F32(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_F32 - - class Inst_VOP3__V_MUL_I32_I24 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_I32_I24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - 
void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_I32_I24 - - class Inst_VOP3__V_MUL_HI_I32_I24 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_HI_I32_I24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_HI_I32_I24 - - class Inst_VOP3__V_MUL_U32_U24 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_U32_U24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_U32_U24 - - class Inst_VOP3__V_MUL_HI_U32_U24 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_HI_U32_U24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out 
of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_HI_U32_U24 - - class Inst_VOP3__V_MIN_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MIN_F32(InFmt_VOP3*); - ~Inst_VOP3__V_MIN_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MIN_F32 - - class Inst_VOP3__V_MAX_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAX_F32(InFmt_VOP3*); - ~Inst_VOP3__V_MAX_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAX_F32 - - class Inst_VOP3__V_MIN_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MIN_I32(InFmt_VOP3*); - ~Inst_VOP3__V_MIN_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i 
out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MIN_I32 - - class Inst_VOP3__V_MAX_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAX_I32(InFmt_VOP3*); - ~Inst_VOP3__V_MAX_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAX_I32 - - class Inst_VOP3__V_MIN_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MIN_U32(InFmt_VOP3*); - ~Inst_VOP3__V_MIN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MIN_U32 - - class Inst_VOP3__V_MAX_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAX_U32(InFmt_VOP3*); - ~Inst_VOP3__V_MAX_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out 
of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAX_U32 - - class Inst_VOP3__V_LSHRREV_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3*); - ~Inst_VOP3__V_LSHRREV_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LSHRREV_B32 - - class Inst_VOP3__V_ASHRREV_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3*); - ~Inst_VOP3__V_ASHRREV_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ASHRREV_I32 - - class Inst_VOP3__V_LSHLREV_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3*); - ~Inst_VOP3__V_LSHLREV_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - 
return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LSHLREV_B32 - - class Inst_VOP3__V_AND_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_AND_B32(InFmt_VOP3*); - ~Inst_VOP3__V_AND_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_AND_B32 - - class Inst_VOP3__V_OR_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_OR_B32(InFmt_VOP3*); - ~Inst_VOP3__V_OR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_OR_B32 - - class Inst_VOP3__V_XOR_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_XOR_B32(InFmt_VOP3*); - ~Inst_VOP3__V_XOR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - 
return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_XOR_B32 - - class Inst_VOP3__V_MAC_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAC_F32(InFmt_VOP3*); - ~Inst_VOP3__V_MAC_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAC_F32 - - class Inst_VOP3__V_ADD_U32 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_ADD_U32(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_ADD_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - case 3: //carryout - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ADD_U32 - - class Inst_VOP3__V_SUB_U32 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_SUB_U32(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_SUB_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: 
//src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - case 3: //carryout - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SUB_U32 - - class Inst_VOP3__V_SUBREV_U32 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_SUBREV_U32(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_SUBREV_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - case 3: //carryout - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SUBREV_U32 - - class Inst_VOP3__V_ADDC_U32 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_ADDC_U32(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_ADDC_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //carryin - return 8; - case 3: //vdst - return 4; - case 4: //carryout - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ADDC_U32 - - class Inst_VOP3__V_SUBB_U32 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_SUBB_U32(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_SUBB_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); 
- } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //carryin - return 8; - case 3: //vdst - return 4; - case 4: //carryout - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SUBB_U32 - - class Inst_VOP3__V_SUBBREV_U32 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_SUBBREV_U32(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_SUBBREV_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //carryin - return 8; - case 3: //vdst - return 4; - case 4: //carryout - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SUBBREV_U32 - - class Inst_VOP3__V_ADD_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_ADD_F16(InFmt_VOP3*); - ~Inst_VOP3__V_ADD_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ADD_F16 - - class Inst_VOP3__V_SUB_F16 : public Inst_VOP3 - { - 
public: - Inst_VOP3__V_SUB_F16(InFmt_VOP3*); - ~Inst_VOP3__V_SUB_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SUB_F16 - - class Inst_VOP3__V_SUBREV_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SUBREV_F16(InFmt_VOP3*); - ~Inst_VOP3__V_SUBREV_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SUBREV_F16 - - class Inst_VOP3__V_MUL_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_F16(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_F16 - - class Inst_VOP3__V_MAC_F16 : public 
Inst_VOP3 - { - public: - Inst_VOP3__V_MAC_F16(InFmt_VOP3*); - ~Inst_VOP3__V_MAC_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAC_F16 - - class Inst_VOP3__V_ADD_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_ADD_U16(InFmt_VOP3*); - ~Inst_VOP3__V_ADD_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ADD_U16 - - class Inst_VOP3__V_SUB_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SUB_U16(InFmt_VOP3*); - ~Inst_VOP3__V_SUB_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SUB_U16 - - class Inst_VOP3__V_SUBREV_U16 : public 
Inst_VOP3 - { - public: - Inst_VOP3__V_SUBREV_U16(InFmt_VOP3*); - ~Inst_VOP3__V_SUBREV_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SUBREV_U16 - - class Inst_VOP3__V_MUL_LO_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_LO_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_LO_U16 - - class Inst_VOP3__V_LSHLREV_B16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3*); - ~Inst_VOP3__V_LSHLREV_B16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LSHLREV_B16 - - 
class Inst_VOP3__V_LSHRREV_B16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3*); - ~Inst_VOP3__V_LSHRREV_B16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LSHRREV_B16 - - class Inst_VOP3__V_ASHRREV_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3*); - ~Inst_VOP3__V_ASHRREV_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ASHRREV_I16 - - class Inst_VOP3__V_MAX_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAX_F16(InFmt_VOP3*); - ~Inst_VOP3__V_MAX_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; 
// Presumed 16-bit float minimum (per mnemonic). Operands: two 2-byte
// sources, one 2-byte vdst.
class Inst_VOP3__V_MIN_F16 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_MIN_F16(InFmt_VOP3*);
    ~Inst_VOP3__V_MIN_F16();

    // Total operand count = destination + source register operands.
    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 2; }

    // Byte size of operand opIdx (0/1 = sources, 2 = vdst); fatal()
    // on any other index.
    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return 2;
          case 1: //src_1
            return 2;
          case 2: //vdst
            return 2;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_MIN_F16

// Presumed 16-bit unsigned maximum (per mnemonic). Operands: two
// 2-byte sources, one 2-byte vdst.
class Inst_VOP3__V_MAX_U16 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_MAX_U16(InFmt_VOP3*);
    ~Inst_VOP3__V_MAX_U16();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 2; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return 2;
          case 1: //src_1
            return 2;
          case 2: //vdst
            return 2;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_MAX_U16

// Presumed 16-bit signed maximum (per mnemonic). Operands: two 2-byte
// sources, one 2-byte vdst.
class Inst_VOP3__V_MAX_I16 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_MAX_I16(InFmt_VOP3*);
    ~Inst_VOP3__V_MAX_I16();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 2; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return 2;
          case 1: //src_1
            return 2;
          case 2: //vdst
            return 2;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_MAX_I16

// Presumed 16-bit unsigned minimum (per mnemonic). Operands: two
// 2-byte sources, one 2-byte vdst.
class Inst_VOP3__V_MIN_U16 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_MIN_U16(InFmt_VOP3*);
    ~Inst_VOP3__V_MIN_U16();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 2; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return 2;
          case 1: //src_1
            return 2;
          case 2: //vdst
            return 2;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_MIN_U16

// Presumed 16-bit signed minimum (per mnemonic). Operands: two 2-byte
// sources, one 2-byte vdst.
class Inst_VOP3__V_MIN_I16 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_MIN_I16(InFmt_VOP3*);
    ~Inst_VOP3__V_MIN_I16();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 2; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return 2;
          case 1: //src_1
            return 2;
          case 2: //vdst
            return 2;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_MIN_I16

// Presumed half-precision ldexp (per mnemonic — mantissa scaled by a
// power of two). Operands: two 2-byte sources, one 2-byte vdst.
class Inst_VOP3__V_LDEXP_F16 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_LDEXP_F16(InFmt_VOP3*);
    ~Inst_VOP3__V_LDEXP_F16();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 2; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return 2;
          case 1: //src_1
            return 2;
          case 2: //vdst
            return 2;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_LDEXP_F16
// VOP3-encoded no-op and single-source instructions (moves, converts,
// rounding, transcendentals). Each class exposes only decode metadata;
// execute() is declared here and defined out of line.

// No-op: zero source and zero destination register operands, so every
// operand index falls through to the fatal() default.
class Inst_VOP3__V_NOP : public Inst_VOP3
{
  public:
    Inst_VOP3__V_NOP(InFmt_VOP3*);
    ~Inst_VOP3__V_NOP();

    // Total operand count = destination + source register operands
    // (0 + 0 here).
    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 0; }
    int numSrcRegOperands() override { return 0; }

    // No valid operand indices; the return -1 after fatal() is
    // unreachable and only placates the compiler.
    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_NOP

// Presumed 32-bit move (per mnemonic). Operands: 4-byte src, 4-byte
// vdst.
class Inst_VOP3__V_MOV_B32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_MOV_B32(InFmt_VOP3*);
    ~Inst_VOP3__V_MOV_B32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_MOV_B32

// Presumed F64 -> I32 convert (per mnemonic). Operands: 8-byte src,
// 4-byte vdst.
class Inst_VOP3__V_CVT_I32_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_I32_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_I32_F64

// Presumed I32 -> F64 convert (per mnemonic). Operands: 4-byte src,
// 8-byte vdst.
class Inst_VOP3__V_CVT_F64_I32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F64_I32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F64_I32

// Presumed I32 -> F32 convert (per mnemonic). Operands: 4-byte src,
// 4-byte vdst.
class Inst_VOP3__V_CVT_F32_I32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F32_I32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F32_I32

// Presumed U32 -> F32 convert (per mnemonic). Operands: 4-byte src,
// 4-byte vdst.
class Inst_VOP3__V_CVT_F32_U32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F32_U32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F32_U32

// Presumed F32 -> U32 convert (per mnemonic). Operands: 4-byte src,
// 4-byte vdst.
class Inst_VOP3__V_CVT_U32_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_U32_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_U32_F32

// Presumed F32 -> I32 convert (per mnemonic). Operands: 4-byte src,
// 4-byte vdst.
class Inst_VOP3__V_CVT_I32_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_I32_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_I32_F32

// Presumed 32-bit move with FED (per mnemonic — TODO confirm semantics
// in the ISA manual). Operands: 4-byte src, 4-byte vdst.
class Inst_VOP3__V_MOV_FED_B32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3*);
    ~Inst_VOP3__V_MOV_FED_B32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_MOV_FED_B32

// Presumed F32 -> F16 convert (per mnemonic). Operands: 4-byte src,
// 2-byte vdst.
class Inst_VOP3__V_CVT_F16_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F16_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 2;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F16_F32

// Presumed F16 -> F32 convert (per mnemonic). Operands: 2-byte src,
// 4-byte vdst.
class Inst_VOP3__V_CVT_F32_F16 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F32_F16();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 2;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F32_F16

// Presumed round-towards-plus-infinity F32 -> I32 convert (per
// mnemonic). Operands: 4-byte src, 4-byte vdst.
class Inst_VOP3__V_CVT_RPI_I32_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_RPI_I32_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_RPI_I32_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_RPI_I32_F32

// Presumed floor F32 -> I32 convert (per mnemonic). Operands: 4-byte
// src, 4-byte vdst.
class Inst_VOP3__V_CVT_FLR_I32_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_FLR_I32_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_FLR_I32_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_FLR_I32_F32

// Presumed 4-bit signed -> offset F32 convert (per mnemonic — TODO
// confirm in ISA manual). Operands: 4-byte src, 4-byte vdst.
class Inst_VOP3__V_CVT_OFF_F32_I4 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_OFF_F32_I4();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_OFF_F32_I4

// Presumed F64 -> F32 convert (per mnemonic). Operands: 8-byte src,
// 4-byte vdst.
class Inst_VOP3__V_CVT_F32_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F32_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F32_F64

// Presumed F32 -> F64 convert (per mnemonic). Operands: 4-byte src,
// 8-byte vdst.
class Inst_VOP3__V_CVT_F64_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F64_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F64_F32

// Presumed unsigned byte 0 -> F32 convert (per mnemonic). Operands:
// 4-byte src, 4-byte vdst.
class Inst_VOP3__V_CVT_F32_UBYTE0 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F32_UBYTE0();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F32_UBYTE0

// Presumed unsigned byte 1 -> F32 convert (per mnemonic). Operands:
// 4-byte src, 4-byte vdst.
class Inst_VOP3__V_CVT_F32_UBYTE1 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F32_UBYTE1();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F32_UBYTE1

// Presumed unsigned byte 2 -> F32 convert (per mnemonic). Operands:
// 4-byte src, 4-byte vdst.
class Inst_VOP3__V_CVT_F32_UBYTE2 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F32_UBYTE2();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F32_UBYTE2

// Presumed unsigned byte 3 -> F32 convert (per mnemonic). Operands:
// 4-byte src, 4-byte vdst.
class Inst_VOP3__V_CVT_F32_UBYTE3 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F32_UBYTE3();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F32_UBYTE3

// Presumed F64 -> U32 convert (per mnemonic). Operands: 8-byte src,
// 4-byte vdst.
class Inst_VOP3__V_CVT_U32_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_U32_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_U32_F64

// Presumed U32 -> F64 convert (per mnemonic). Operands: 4-byte src,
// 8-byte vdst.
class Inst_VOP3__V_CVT_F64_U32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3*);
    ~Inst_VOP3__V_CVT_F64_U32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CVT_F64_U32

// Presumed F64 truncate (per mnemonic). Operands: 8-byte src, 8-byte
// vdst.
class Inst_VOP3__V_TRUNC_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_TRUNC_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_TRUNC_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_TRUNC_F64

// Presumed F64 ceiling (per mnemonic). Operands: 8-byte src, 8-byte
// vdst.
class Inst_VOP3__V_CEIL_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CEIL_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_CEIL_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CEIL_F64

// Presumed F64 round-to-nearest-even (per mnemonic). Operands: 8-byte
// src, 8-byte vdst.
class Inst_VOP3__V_RNDNE_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_RNDNE_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_RNDNE_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_RNDNE_F64

// Presumed F64 floor (per mnemonic). Operands: 8-byte src, 8-byte
// vdst.
class Inst_VOP3__V_FLOOR_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_FLOOR_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_FLOOR_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_FLOOR_F64

// Presumed F32 fractional part (per mnemonic). Operands: 4-byte src,
// 4-byte vdst.
class Inst_VOP3__V_FRACT_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_FRACT_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_FRACT_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_FRACT_F32

// Presumed F32 truncate (per mnemonic). Operands: 4-byte src, 4-byte
// vdst.
class Inst_VOP3__V_TRUNC_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_TRUNC_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_TRUNC_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_TRUNC_F32

// Presumed F32 ceiling (per mnemonic). Operands: 4-byte src, 4-byte
// vdst.
class Inst_VOP3__V_CEIL_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_CEIL_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_CEIL_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_CEIL_F32

// Presumed F32 round-to-nearest-even (per mnemonic). Operands: 4-byte
// src, 4-byte vdst.
class Inst_VOP3__V_RNDNE_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_RNDNE_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_RNDNE_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_RNDNE_F32

// Presumed F32 floor (per mnemonic). Operands: 4-byte src, 4-byte
// vdst.
class Inst_VOP3__V_FLOOR_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_FLOOR_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_FLOOR_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_FLOOR_F32

// Presumed F32 base-2 exponential (per mnemonic). Operands: 4-byte
// src, 4-byte vdst.
class Inst_VOP3__V_EXP_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_EXP_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_EXP_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_EXP_F32

// Presumed F32 base-2 logarithm (per mnemonic). Operands: 4-byte src,
// 4-byte vdst.
class Inst_VOP3__V_LOG_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_LOG_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_LOG_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_LOG_F32

// Presumed F32 reciprocal (per mnemonic). Operands: 4-byte src,
// 4-byte vdst.
class Inst_VOP3__V_RCP_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_RCP_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_RCP_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_RCP_F32

// Presumed F32 reciprocal, IEEE flag variant (per mnemonic — TODO
// confirm in ISA manual). Operands: 4-byte src, 4-byte vdst.
class Inst_VOP3__V_RCP_IFLAG_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_RCP_IFLAG_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_RCP_IFLAG_F32

// Presumed F32 reciprocal square root (per mnemonic). Operands:
// 4-byte src, 4-byte vdst.
class Inst_VOP3__V_RSQ_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_RSQ_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_RSQ_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_RSQ_F32

// Presumed F64 reciprocal (per mnemonic). Operands: 8-byte src,
// 8-byte vdst.
class Inst_VOP3__V_RCP_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_RCP_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_RCP_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_RCP_F64

// Presumed F64 reciprocal square root (per mnemonic). Operands:
// 8-byte src, 8-byte vdst.
class Inst_VOP3__V_RSQ_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_RSQ_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_RSQ_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_RSQ_F64

// Presumed F32 square root (per mnemonic). Operands: 4-byte src,
// 4-byte vdst.
class Inst_VOP3__V_SQRT_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_SQRT_F32(InFmt_VOP3*);
    ~Inst_VOP3__V_SQRT_F32();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 4;
          case 1: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_SQRT_F32

// Presumed F64 square root (per mnemonic). Operands: 8-byte src,
// 8-byte vdst.
class Inst_VOP3__V_SQRT_F64 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_SQRT_F64(InFmt_VOP3*);
    ~Inst_VOP3__V_SQRT_F64();

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    } // getNumOperands

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 1; }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src
            return 8;
          case 1: //vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_SQRT_F64

// Presumed F32 sine (per mnemonic); declaration continues below.
class Inst_VOP3__V_SIN_F32 : public Inst_VOP3
{
  public:
    Inst_VOP3__V_SIN_F32(InFmt_VOP3*);
~Inst_VOP3__V_SIN_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SIN_F32 - - class Inst_VOP3__V_COS_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_COS_F32(InFmt_VOP3*); - ~Inst_VOP3__V_COS_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_COS_F32 - - class Inst_VOP3__V_NOT_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_NOT_B32(InFmt_VOP3*); - ~Inst_VOP3__V_NOT_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_NOT_B32 - - class Inst_VOP3__V_BFREV_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_BFREV_B32(InFmt_VOP3*); - ~Inst_VOP3__V_BFREV_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() 
+ numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_BFREV_B32 - - class Inst_VOP3__V_FFBH_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FFBH_U32(InFmt_VOP3*); - ~Inst_VOP3__V_FFBH_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FFBH_U32 - - class Inst_VOP3__V_FFBL_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FFBL_B32(InFmt_VOP3*); - ~Inst_VOP3__V_FFBL_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FFBL_B32 - - class Inst_VOP3__V_FFBH_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FFBH_I32(InFmt_VOP3*); - ~Inst_VOP3__V_FFBH_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { 
return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FFBH_I32 - - class Inst_VOP3__V_FREXP_EXP_I32_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FREXP_EXP_I32_F64(InFmt_VOP3*); - ~Inst_VOP3__V_FREXP_EXP_I32_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FREXP_EXP_I32_F64 - - class Inst_VOP3__V_FREXP_MANT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_FREXP_MANT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FREXP_MANT_F64 - - class Inst_VOP3__V_FRACT_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FRACT_F64(InFmt_VOP3*); - ~Inst_VOP3__V_FRACT_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int 
numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 8; - case 1: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FRACT_F64 - - class Inst_VOP3__V_FREXP_EXP_I32_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FREXP_EXP_I32_F32(InFmt_VOP3*); - ~Inst_VOP3__V_FREXP_EXP_I32_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FREXP_EXP_I32_F32 - - class Inst_VOP3__V_FREXP_MANT_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3*); - ~Inst_VOP3__V_FREXP_MANT_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FREXP_MANT_F32 - - class Inst_VOP3__V_CLREXCP : public Inst_VOP3 - { - public: - Inst_VOP3__V_CLREXCP(InFmt_VOP3*); - ~Inst_VOP3__V_CLREXCP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { 
return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CLREXCP - - class Inst_VOP3__V_CVT_F16_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_F16_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_F16_U16 - - class Inst_VOP3__V_CVT_F16_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_F16_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_F16_I16 - - class Inst_VOP3__V_CVT_U16_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_U16_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - 
case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_U16_F16 - - class Inst_VOP3__V_CVT_I16_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_I16_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_I16_F16 - - class Inst_VOP3__V_RCP_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_RCP_F16(InFmt_VOP3*); - ~Inst_VOP3__V_RCP_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_RCP_F16 - - class Inst_VOP3__V_SQRT_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SQRT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_SQRT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", 
opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SQRT_F16 - - class Inst_VOP3__V_RSQ_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_RSQ_F16(InFmt_VOP3*); - ~Inst_VOP3__V_RSQ_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_RSQ_F16 - - class Inst_VOP3__V_LOG_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LOG_F16(InFmt_VOP3*); - ~Inst_VOP3__V_LOG_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LOG_F16 - - class Inst_VOP3__V_EXP_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_EXP_F16(InFmt_VOP3*); - ~Inst_VOP3__V_EXP_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_VOP3__V_EXP_F16 - - class Inst_VOP3__V_FREXP_MANT_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_FREXP_MANT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FREXP_MANT_F16 - - class Inst_VOP3__V_FREXP_EXP_I16_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FREXP_EXP_I16_F16(InFmt_VOP3*); - ~Inst_VOP3__V_FREXP_EXP_I16_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FREXP_EXP_I16_F16 - - class Inst_VOP3__V_FLOOR_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FLOOR_F16(InFmt_VOP3*); - ~Inst_VOP3__V_FLOOR_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FLOOR_F16 - - 
class Inst_VOP3__V_CEIL_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CEIL_F16(InFmt_VOP3*); - ~Inst_VOP3__V_CEIL_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CEIL_F16 - - class Inst_VOP3__V_TRUNC_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_TRUNC_F16(InFmt_VOP3*); - ~Inst_VOP3__V_TRUNC_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_TRUNC_F16 - - class Inst_VOP3__V_RNDNE_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_RNDNE_F16(InFmt_VOP3*); - ~Inst_VOP3__V_RNDNE_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_RNDNE_F16 - - class Inst_VOP3__V_FRACT_F16 : public Inst_VOP3 - { - public: - 
Inst_VOP3__V_FRACT_F16(InFmt_VOP3*); - ~Inst_VOP3__V_FRACT_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FRACT_F16 - - class Inst_VOP3__V_SIN_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SIN_F16(InFmt_VOP3*); - ~Inst_VOP3__V_SIN_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SIN_F16 - - class Inst_VOP3__V_COS_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_COS_F16(InFmt_VOP3*); - ~Inst_VOP3__V_COS_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 2; - case 1: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_COS_F16 - - class Inst_VOP3__V_EXP_LEGACY_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3*); - ~Inst_VOP3__V_EXP_LEGACY_F32(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_EXP_LEGACY_F32 - - class Inst_VOP3__V_LOG_LEGACY_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3*); - ~Inst_VOP3__V_LOG_LEGACY_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src - return 4; - case 1: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LOG_LEGACY_F32 - - class Inst_VOP3__V_MAD_LEGACY_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3*); - ~Inst_VOP3__V_MAD_LEGACY_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAD_LEGACY_F32 - - class Inst_VOP3__V_MAD_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAD_F32(InFmt_VOP3*); - 
~Inst_VOP3__V_MAD_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAD_F32 - - class Inst_VOP3__V_MAD_I32_I24 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3*); - ~Inst_VOP3__V_MAD_I32_I24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAD_I32_I24 - - class Inst_VOP3__V_MAD_U32_U24 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3*); - ~Inst_VOP3__V_MAD_U32_U24(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_VOP3__V_MAD_U32_U24 - - class Inst_VOP3__V_CUBEID_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CUBEID_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CUBEID_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CUBEID_F32 - - class Inst_VOP3__V_CUBESC_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CUBESC_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CUBESC_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CUBESC_F32 - - class Inst_VOP3__V_CUBETC_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CUBETC_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CUBETC_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op 
idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CUBETC_F32 - - class Inst_VOP3__V_CUBEMA_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CUBEMA_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CUBEMA_F32 - - class Inst_VOP3__V_BFE_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_BFE_U32(InFmt_VOP3*); - ~Inst_VOP3__V_BFE_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_BFE_U32 - - class Inst_VOP3__V_BFE_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_BFE_I32(InFmt_VOP3*); - ~Inst_VOP3__V_BFE_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: 
//src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_BFE_I32 - - class Inst_VOP3__V_BFI_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_BFI_B32(InFmt_VOP3*); - ~Inst_VOP3__V_BFI_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_BFI_B32 - - class Inst_VOP3__V_FMA_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FMA_F32(InFmt_VOP3*); - ~Inst_VOP3__V_FMA_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FMA_F32 - - class Inst_VOP3__V_FMA_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FMA_F64(InFmt_VOP3*); - ~Inst_VOP3__V_FMA_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //src_2 - return 8; - case 3: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FMA_F64 - - class Inst_VOP3__V_LERP_U8 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LERP_U8(InFmt_VOP3*); - ~Inst_VOP3__V_LERP_U8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LERP_U8 - - class Inst_VOP3__V_ALIGNBIT_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3*); - ~Inst_VOP3__V_ALIGNBIT_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ALIGNBIT_B32 - - class Inst_VOP3__V_ALIGNBYTE_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3*); - ~Inst_VOP3__V_ALIGNBYTE_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // 
getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ALIGNBYTE_B32 - - class Inst_VOP3__V_MIN3_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MIN3_F32(InFmt_VOP3*); - ~Inst_VOP3__V_MIN3_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MIN3_F32 - - class Inst_VOP3__V_MIN3_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MIN3_I32(InFmt_VOP3*); - ~Inst_VOP3__V_MIN3_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MIN3_I32 - - class Inst_VOP3__V_MIN3_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MIN3_U32(InFmt_VOP3*); - 
~Inst_VOP3__V_MIN3_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MIN3_U32 - - class Inst_VOP3__V_MAX3_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAX3_F32(InFmt_VOP3*); - ~Inst_VOP3__V_MAX3_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAX3_F32 - - class Inst_VOP3__V_MAX3_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAX3_I32(InFmt_VOP3*); - ~Inst_VOP3__V_MAX3_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAX3_I32 - - 
class Inst_VOP3__V_MAX3_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAX3_U32(InFmt_VOP3*); - ~Inst_VOP3__V_MAX3_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAX3_U32 - - class Inst_VOP3__V_MED3_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MED3_F32(InFmt_VOP3*); - ~Inst_VOP3__V_MED3_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MED3_F32 - - class Inst_VOP3__V_MED3_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MED3_I32(InFmt_VOP3*); - ~Inst_VOP3__V_MED3_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } 
- } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MED3_I32 - - class Inst_VOP3__V_MED3_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MED3_U32(InFmt_VOP3*); - ~Inst_VOP3__V_MED3_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MED3_U32 - - class Inst_VOP3__V_SAD_U8 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SAD_U8(InFmt_VOP3*); - ~Inst_VOP3__V_SAD_U8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SAD_U8 - - class Inst_VOP3__V_SAD_HI_U8 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3*); - ~Inst_VOP3__V_SAD_HI_U8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: 
//vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SAD_HI_U8 - - class Inst_VOP3__V_SAD_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SAD_U16(InFmt_VOP3*); - ~Inst_VOP3__V_SAD_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SAD_U16 - - class Inst_VOP3__V_SAD_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_SAD_U32(InFmt_VOP3*); - ~Inst_VOP3__V_SAD_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_SAD_U32 - - class Inst_VOP3__V_CVT_PK_U8_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_PK_U8_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch 
(opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_PK_U8_F32 - - class Inst_VOP3__V_DIV_FIXUP_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3*); - ~Inst_VOP3__V_DIV_FIXUP_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_DIV_FIXUP_F32 - - class Inst_VOP3__V_DIV_FIXUP_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3*); - ~Inst_VOP3__V_DIV_FIXUP_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //src_2 - return 8; - case 3: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_DIV_FIXUP_F64 - - class Inst_VOP3__V_DIV_SCALE_F32 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_DIV_SCALE_F32(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_DIV_SCALE_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } 
// getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - case 4: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_DIV_SCALE_F32 - - class Inst_VOP3__V_DIV_SCALE_F64 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_DIV_SCALE_F64(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_DIV_SCALE_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //src_2 - return 8; - case 3: //vdst - return 8; - case 4: //vcc - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_DIV_SCALE_F64 - - class Inst_VOP3__V_DIV_FMAS_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3*); - ~Inst_VOP3__V_DIV_FMAS_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: - return 8; - case 4: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_VOP3__V_DIV_FMAS_F32 - - class Inst_VOP3__V_DIV_FMAS_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3*); - ~Inst_VOP3__V_DIV_FMAS_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //src_2 - return 8; - case 3: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_DIV_FMAS_F64 - - class Inst_VOP3__V_MSAD_U8 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MSAD_U8(InFmt_VOP3*); - ~Inst_VOP3__V_MSAD_U8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MSAD_U8 - - class Inst_VOP3__V_QSAD_PK_U16_U8 : public Inst_VOP3 - { - public: - Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3*); - ~Inst_VOP3__V_QSAD_PK_U16_U8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 4; - case 2: //src_2 - return 8; - case 3: //vdst - return 8; - default: - 
fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_QSAD_PK_U16_U8 - - class Inst_VOP3__V_MQSAD_PK_U16_U8 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MQSAD_PK_U16_U8(InFmt_VOP3*); - ~Inst_VOP3__V_MQSAD_PK_U16_U8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 4; - case 2: //src_2 - return 8; - case 3: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MQSAD_PK_U16_U8 - - class Inst_VOP3__V_MQSAD_U32_U8 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3*); - ~Inst_VOP3__V_MQSAD_U32_U8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 4; - case 2: //src_2 - return 16; - case 3: //vdst - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MQSAD_U32_U8 - - class Inst_VOP3__V_MAD_U64_U32 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_MAD_U64_U32(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_MAD_U64_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - case 4: //carryout - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAD_U64_U32 - - class Inst_VOP3__V_MAD_I64_I32 : public Inst_VOP3_SDST_ENC - { - public: - Inst_VOP3__V_MAD_I64_I32(InFmt_VOP3_SDST_ENC*); - ~Inst_VOP3__V_MAD_I64_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 2; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - case 4: //carryout - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAD_I64_I32 - - class Inst_VOP3__V_MAD_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAD_F16(InFmt_VOP3*); - ~Inst_VOP3__V_MAD_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //src_2 - return 2; - case 3: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAD_F16 - - class Inst_VOP3__V_MAD_U16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAD_U16(InFmt_VOP3*); - ~Inst_VOP3__V_MAD_U16(); - - int - getNumOperands() 
override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //src_2 - return 2; - case 3: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAD_U16 - - class Inst_VOP3__V_MAD_I16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAD_I16(InFmt_VOP3*); - ~Inst_VOP3__V_MAD_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //src_2 - return 2; - case 3: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAD_I16 - - class Inst_VOP3__V_PERM_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_PERM_B32(InFmt_VOP3*); - ~Inst_VOP3__V_PERM_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //src_2 - return 4; - case 3: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - // From the GCN3 ISA SPEC: - // byte permute(byte in[8], byte sel) { - // if (sel>=13) then return 0xff; - // elsif (sel==12) then 
return 0x00; - // elsif (sel==11) then return in[7][7] * 0xff; - // elsif (sel==10) then return in[5][7] * 0xff; - // elsif (sel==9) then return in[3][7] * 0xff; - // elsif (sel==8) then return in[1][7] * 0xff; - // else return in[sel]; - // } - // NOTE: I interpret the in[x][7] notation to mean "the high order - // bit of byte x". - uint8_t - permute(uint64_t in_dword2x, uint32_t sel) - { - assert (sel < 256); - uint8_t *in = reinterpret_cast(&in_dword2x); - DPRINTF(GCN3, "in_dword2x = 0x%08x\n", in_dword2x); - DPRINTF(GCN3, "Selecting %x using index %d\n", in[sel], sel); - if (sel >= 13) return 0xFF; - else if (sel == 12) return 0; - else if (sel == 11) return (in[7] & 0x80) ? 0xFF : 0; - else if (sel == 10) return (in[5] & 0x80) ? 0xFF : 0; - else if (sel == 9) return (in[3] & 0x80) ? 0xFF : 0; - else if (sel == 8) return (in[1] & 0x80) ? 0xFF : 0; - else return in[sel]; - } - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_PERM_B32 - - class Inst_VOP3__V_FMA_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_FMA_F16(InFmt_VOP3*); - ~Inst_VOP3__V_FMA_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //src_2 - return 2; - case 3: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_FMA_F16 - - class Inst_VOP3__V_DIV_FIXUP_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3*); - ~Inst_VOP3__V_DIV_FIXUP_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() 
override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 2; - case 1: //src_1 - return 2; - case 2: //src_2 - return 2; - case 3: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_DIV_FIXUP_F16 - - class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_PKACCUM_U8_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_PKACCUM_U8_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_PKACCUM_U8_F32 - - class Inst_VOP3__V_INTERP_P1_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3*); - ~Inst_VOP3__V_INTERP_P1_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_ij - return 4; - case 1: //attr - return 32; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_INTERP_P1_F32 - - class Inst_VOP3__V_INTERP_P2_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3*); - ~Inst_VOP3__V_INTERP_P2_F32(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_ij - return 4; - case 1: //attr - return 32; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_INTERP_P2_F32 - - class Inst_VOP3__V_INTERP_MOV_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3*); - ~Inst_VOP3__V_INTERP_MOV_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //param - return 4; - case 1: //attr - return 32; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_INTERP_MOV_F32 - - class Inst_VOP3__V_INTERP_P1LL_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_INTERP_P1LL_F16(InFmt_VOP3*); - ~Inst_VOP3__V_INTERP_P1LL_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_ij - return 4; - case 1: //attr - return 2; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_INTERP_P1LL_F16 - - class Inst_VOP3__V_INTERP_P1LV_F16 : public Inst_VOP3 - { - public: - 
Inst_VOP3__V_INTERP_P1LV_F16(InFmt_VOP3*); - ~Inst_VOP3__V_INTERP_P1LV_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_ij - return 4; - case 1: //attr - return 2; - case 2: //vgpr_add - return 2; - case 3: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_INTERP_P1LV_F16 - - class Inst_VOP3__V_INTERP_P2_F16 : public Inst_VOP3 - { - public: - Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3*); - ~Inst_VOP3__V_INTERP_P2_F16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_ij - return 4; - case 1: //attr - return 2; - case 2: //vgpr_add - return 4; - case 3: //vgpr_dst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_INTERP_P2_F16 - - class Inst_VOP3__V_ADD_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_ADD_F64(InFmt_VOP3*); - ~Inst_VOP3__V_ADD_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ADD_F64 - - class Inst_VOP3__V_MUL_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_F64(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_F64 - - class Inst_VOP3__V_MIN_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MIN_F64(InFmt_VOP3*); - ~Inst_VOP3__V_MIN_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MIN_F64 - - class Inst_VOP3__V_MAX_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MAX_F64(InFmt_VOP3*); - ~Inst_VOP3__V_MAX_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 8; - case 2: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MAX_F64 - - class Inst_VOP3__V_LDEXP_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LDEXP_F64(InFmt_VOP3*); - ~Inst_VOP3__V_LDEXP_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 4; - case 2: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LDEXP_F64 - - class Inst_VOP3__V_MUL_LO_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_LO_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_LO_U32 - - class Inst_VOP3__V_MUL_HI_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_HI_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // 
getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_HI_U32 - - class Inst_VOP3__V_MUL_HI_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3*); - ~Inst_VOP3__V_MUL_HI_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MUL_HI_I32 - - class Inst_VOP3__V_LDEXP_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LDEXP_F32(InFmt_VOP3*); - ~Inst_VOP3__V_LDEXP_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LDEXP_F32 - - class Inst_VOP3__V_READLANE_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_READLANE_B32(InFmt_VOP3*); - ~Inst_VOP3__V_READLANE_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vsrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //sdst - return 4; - default: - fatal("op idx %i out of 
bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_READLANE_B32 - - class Inst_VOP3__V_WRITELANE_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3*); - ~Inst_VOP3__V_WRITELANE_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //ssrc_0 - return 4; - case 1: //ssrc_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_WRITELANE_B32 - - class Inst_VOP3__V_BCNT_U32_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3*); - ~Inst_VOP3__V_BCNT_U32_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_BCNT_U32_B32 - - class Inst_VOP3__V_MBCNT_LO_U32_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MBCNT_LO_U32_B32(InFmt_VOP3*); - ~Inst_VOP3__V_MBCNT_LO_U32_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - 
return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MBCNT_LO_U32_B32 - - class Inst_VOP3__V_MBCNT_HI_U32_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_MBCNT_HI_U32_B32(InFmt_VOP3*); - ~Inst_VOP3__V_MBCNT_HI_U32_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_MBCNT_HI_U32_B32 - - class Inst_VOP3__V_LSHLREV_B64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3*); - ~Inst_VOP3__V_LSHLREV_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 8; - case 2: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LSHLREV_B64 - - class Inst_VOP3__V_LSHRREV_B64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3*); - ~Inst_VOP3__V_LSHRREV_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { 
- switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 8; - case 2: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_LSHRREV_B64 - - class Inst_VOP3__V_ASHRREV_I64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3*); - ~Inst_VOP3__V_ASHRREV_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 8; - case 2: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_ASHRREV_I64 - - class Inst_VOP3__V_TRIG_PREOP_F64 : public Inst_VOP3 - { - public: - Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3*); - ~Inst_VOP3__V_TRIG_PREOP_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 8; - case 1: //src_1 - return 4; - case 2: //vdst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_TRIG_PREOP_F64 - - class Inst_VOP3__V_BFM_B32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_BFM_B32(InFmt_VOP3*); - ~Inst_VOP3__V_BFM_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - 
int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_BFM_B32 - - class Inst_VOP3__V_CVT_PKNORM_I16_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_PKNORM_I16_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_PKNORM_I16_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_PKNORM_I16_F32 - - class Inst_VOP3__V_CVT_PKNORM_U16_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_PKNORM_U16_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_PKNORM_U16_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_PKNORM_U16_F32 - - class Inst_VOP3__V_CVT_PKRTZ_F16_F32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_PKRTZ_F16_F32(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_PKRTZ_F16_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // 
getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_PKRTZ_F16_F32 - - class Inst_VOP3__V_CVT_PK_U16_U32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_PK_U16_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_PK_U16_U32 - - class Inst_VOP3__V_CVT_PK_I16_I32 : public Inst_VOP3 - { - public: - Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3*); - ~Inst_VOP3__V_CVT_PK_I16_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //src_0 - return 4; - case 1: //src_1 - return 4; - case 2: //vdst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_VOP3__V_CVT_PK_I16_I32 - - class Inst_DS__DS_ADD_U32 : public Inst_DS - { - public: - Inst_DS__DS_ADD_U32(InFmt_DS*); - ~Inst_DS__DS_ADD_U32(); - - int - getNumOperands() override - { - 
return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ADD_U32 - - class Inst_DS__DS_SUB_U32 : public Inst_DS - { - public: - Inst_DS__DS_SUB_U32(InFmt_DS*); - ~Inst_DS__DS_SUB_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_SUB_U32 - - class Inst_DS__DS_RSUB_U32 : public Inst_DS - { - public: - Inst_DS__DS_RSUB_U32(InFmt_DS*); - ~Inst_DS__DS_RSUB_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_RSUB_U32 - - class Inst_DS__DS_INC_U32 : public Inst_DS - { - public: - Inst_DS__DS_INC_U32(InFmt_DS*); - ~Inst_DS__DS_INC_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() 
override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_INC_U32 - - class Inst_DS__DS_DEC_U32 : public Inst_DS - { - public: - Inst_DS__DS_DEC_U32(InFmt_DS*); - ~Inst_DS__DS_DEC_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_DEC_U32 - - class Inst_DS__DS_MIN_I32 : public Inst_DS - { - public: - Inst_DS__DS_MIN_I32(InFmt_DS*); - ~Inst_DS__DS_MIN_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_I32 - - class Inst_DS__DS_MAX_I32 : public Inst_DS - { - public: - Inst_DS__DS_MAX_I32(InFmt_DS*); - ~Inst_DS__DS_MAX_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int 
opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_I32 - - class Inst_DS__DS_MIN_U32 : public Inst_DS - { - public: - Inst_DS__DS_MIN_U32(InFmt_DS*); - ~Inst_DS__DS_MIN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_U32 - - class Inst_DS__DS_MAX_U32 : public Inst_DS - { - public: - Inst_DS__DS_MAX_U32(InFmt_DS*); - ~Inst_DS__DS_MAX_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_U32 - - class Inst_DS__DS_AND_B32 : public Inst_DS - { - public: - Inst_DS__DS_AND_B32(InFmt_DS*); - ~Inst_DS__DS_AND_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - 
default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_AND_B32 - - class Inst_DS__DS_OR_B32 : public Inst_DS - { - public: - Inst_DS__DS_OR_B32(InFmt_DS*); - ~Inst_DS__DS_OR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_OR_B32 - - class Inst_DS__DS_XOR_B32 : public Inst_DS - { - public: - Inst_DS__DS_XOR_B32(InFmt_DS*); - ~Inst_DS__DS_XOR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_XOR_B32 - - class Inst_DS__DS_MSKOR_B32 : public Inst_DS - { - public: - Inst_DS__DS_MSKOR_B32(InFmt_DS*); - ~Inst_DS__DS_MSKOR_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MSKOR_B32 - - class Inst_DS__DS_WRITE_B32 : public Inst_DS - { - public: - Inst_DS__DS_WRITE_B32(InFmt_DS*); - ~Inst_DS__DS_WRITE_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE_B32 - - class Inst_DS__DS_WRITE2_B32 : public Inst_DS - { - public: - Inst_DS__DS_WRITE2_B32(InFmt_DS*); - ~Inst_DS__DS_WRITE2_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_d1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE2_B32 - - class Inst_DS__DS_WRITE2ST64_B32 : public Inst_DS - { - public: - Inst_DS__DS_WRITE2ST64_B32(InFmt_DS*); - ~Inst_DS__DS_WRITE2ST64_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - 
return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_d1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE2ST64_B32 - - class Inst_DS__DS_CMPST_B32 : public Inst_DS - { - public: - Inst_DS__DS_CMPST_B32(InFmt_DS*); - ~Inst_DS__DS_CMPST_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CMPST_B32 - - class Inst_DS__DS_CMPST_F32 : public Inst_DS - { - public: - Inst_DS__DS_CMPST_F32(InFmt_DS*); - ~Inst_DS__DS_CMPST_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CMPST_F32 - - class Inst_DS__DS_MIN_F32 : public Inst_DS - { - public: - Inst_DS__DS_MIN_F32(InFmt_DS*); - ~Inst_DS__DS_MIN_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - 
switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_F32 - - class Inst_DS__DS_MAX_F32 : public Inst_DS - { - public: - Inst_DS__DS_MAX_F32(InFmt_DS*); - ~Inst_DS__DS_MAX_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_F32 - - class Inst_DS__DS_NOP : public Inst_DS - { - public: - Inst_DS__DS_NOP(InFmt_DS*); - ~Inst_DS__DS_NOP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_NOP - - class Inst_DS__DS_ADD_F32 : public Inst_DS - { - public: - Inst_DS__DS_ADD_F32(InFmt_DS*); - ~Inst_DS__DS_ADD_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - 
} // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ADD_F32 - - class Inst_DS__DS_WRITE_B8 : public Inst_DS - { - public: - Inst_DS__DS_WRITE_B8(InFmt_DS*); - ~Inst_DS__DS_WRITE_B8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 1; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE_B8 - - class Inst_DS__DS_WRITE_B16 : public Inst_DS - { - public: - Inst_DS__DS_WRITE_B16(InFmt_DS*); - ~Inst_DS__DS_WRITE_B16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE_B16 - - class Inst_DS__DS_ADD_RTN_U32 : public Inst_DS - { - public: - Inst_DS__DS_ADD_RTN_U32(InFmt_DS*); - ~Inst_DS__DS_ADD_RTN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: 
//vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ADD_RTN_U32 - - class Inst_DS__DS_SUB_RTN_U32 : public Inst_DS - { - public: - Inst_DS__DS_SUB_RTN_U32(InFmt_DS*); - ~Inst_DS__DS_SUB_RTN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_SUB_RTN_U32 - - class Inst_DS__DS_RSUB_RTN_U32 : public Inst_DS - { - public: - Inst_DS__DS_RSUB_RTN_U32(InFmt_DS*); - ~Inst_DS__DS_RSUB_RTN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_RSUB_RTN_U32 - - class Inst_DS__DS_INC_RTN_U32 : public Inst_DS - { - public: - Inst_DS__DS_INC_RTN_U32(InFmt_DS*); - ~Inst_DS__DS_INC_RTN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch 
(opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_INC_RTN_U32 - - class Inst_DS__DS_DEC_RTN_U32 : public Inst_DS - { - public: - Inst_DS__DS_DEC_RTN_U32(InFmt_DS*); - ~Inst_DS__DS_DEC_RTN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_DEC_RTN_U32 - - class Inst_DS__DS_MIN_RTN_I32 : public Inst_DS - { - public: - Inst_DS__DS_MIN_RTN_I32(InFmt_DS*); - ~Inst_DS__DS_MIN_RTN_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_RTN_I32 - - class Inst_DS__DS_MAX_RTN_I32 : public Inst_DS - { - public: - Inst_DS__DS_MAX_RTN_I32(InFmt_DS*); - ~Inst_DS__DS_MAX_RTN_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_RTN_I32 - - class Inst_DS__DS_MIN_RTN_U32 : public Inst_DS - { - public: - Inst_DS__DS_MIN_RTN_U32(InFmt_DS*); - ~Inst_DS__DS_MIN_RTN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_RTN_U32 - - class Inst_DS__DS_MAX_RTN_U32 : public Inst_DS - { - public: - Inst_DS__DS_MAX_RTN_U32(InFmt_DS*); - ~Inst_DS__DS_MAX_RTN_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_RTN_U32 - - class Inst_DS__DS_AND_RTN_B32 : public Inst_DS - { - public: - Inst_DS__DS_AND_RTN_B32(InFmt_DS*); - ~Inst_DS__DS_AND_RTN_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int 
numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_AND_RTN_B32 - - class Inst_DS__DS_OR_RTN_B32 : public Inst_DS - { - public: - Inst_DS__DS_OR_RTN_B32(InFmt_DS*); - ~Inst_DS__DS_OR_RTN_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_OR_RTN_B32 - - class Inst_DS__DS_XOR_RTN_B32 : public Inst_DS - { - public: - Inst_DS__DS_XOR_RTN_B32(InFmt_DS*); - ~Inst_DS__DS_XOR_RTN_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_XOR_RTN_B32 - - class Inst_DS__DS_MSKOR_RTN_B32 : public Inst_DS - { - public: - Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS*); - ~Inst_DS__DS_MSKOR_RTN_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MSKOR_RTN_B32 - - class Inst_DS__DS_WRXCHG_RTN_B32 : public Inst_DS - { - public: - Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS*); - ~Inst_DS__DS_WRXCHG_RTN_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRXCHG_RTN_B32 - - class Inst_DS__DS_WRXCHG2_RTN_B32 : public Inst_DS - { - public: - Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS*); - ~Inst_DS__DS_WRXCHG2_RTN_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRXCHG2_RTN_B32 - - class Inst_DS__DS_WRXCHG2ST64_RTN_B32 : public Inst_DS - { - public: - Inst_DS__DS_WRXCHG2ST64_RTN_B32(InFmt_DS*); - ~Inst_DS__DS_WRXCHG2ST64_RTN_B32(); - - int - getNumOperands() 
override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRXCHG2ST64_RTN_B32 - - class Inst_DS__DS_CMPST_RTN_B32 : public Inst_DS - { - public: - Inst_DS__DS_CMPST_RTN_B32(InFmt_DS*); - ~Inst_DS__DS_CMPST_RTN_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CMPST_RTN_B32 - - class Inst_DS__DS_CMPST_RTN_F32 : public Inst_DS - { - public: - Inst_DS__DS_CMPST_RTN_F32(InFmt_DS*); - ~Inst_DS__DS_CMPST_RTN_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CMPST_RTN_F32 - - class Inst_DS__DS_MIN_RTN_F32 : public Inst_DS - { - public: - 
Inst_DS__DS_MIN_RTN_F32(InFmt_DS*); - ~Inst_DS__DS_MIN_RTN_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_RTN_F32 - - class Inst_DS__DS_MAX_RTN_F32 : public Inst_DS - { - public: - Inst_DS__DS_MAX_RTN_F32(InFmt_DS*); - ~Inst_DS__DS_MAX_RTN_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_RTN_F32 - - class Inst_DS__DS_WRAP_RTN_B32 : public Inst_DS - { - public: - Inst_DS__DS_WRAP_RTN_B32(InFmt_DS*); - ~Inst_DS__DS_WRAP_RTN_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRAP_RTN_B32 - - class 
Inst_DS__DS_ADD_RTN_F32 : public Inst_DS - { - public: - Inst_DS__DS_ADD_RTN_F32(InFmt_DS*); - ~Inst_DS__DS_ADD_RTN_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ADD_RTN_F32 - - class Inst_DS__DS_READ_B32 : public Inst_DS - { - public: - Inst_DS__DS_READ_B32(InFmt_DS*); - ~Inst_DS__DS_READ_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ_B32 - - class Inst_DS__DS_READ2_B32 : public Inst_DS - { - public: - Inst_DS__DS_READ2_B32(InFmt_DS*); - ~Inst_DS__DS_READ2_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) 
override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ2_B32 - - class Inst_DS__DS_READ2ST64_B32 : public Inst_DS - { - public: - Inst_DS__DS_READ2ST64_B32(InFmt_DS*); - ~Inst_DS__DS_READ2ST64_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ2ST64_B32 - - class Inst_DS__DS_READ_I8 : public Inst_DS - { - public: - Inst_DS__DS_READ_I8(InFmt_DS*); - ~Inst_DS__DS_READ_I8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 1; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ_I8 - - class Inst_DS__DS_READ_U8 : public Inst_DS - { - public: - Inst_DS__DS_READ_U8(InFmt_DS*); - ~Inst_DS__DS_READ_U8(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 1; - default: - fatal("op idx 
%i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ_U8 - - class Inst_DS__DS_READ_I16 : public Inst_DS - { - public: - Inst_DS__DS_READ_I16(InFmt_DS*); - ~Inst_DS__DS_READ_I16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ_I16 - - class Inst_DS__DS_READ_U16 : public Inst_DS - { - public: - Inst_DS__DS_READ_U16(InFmt_DS*); - ~Inst_DS__DS_READ_U16(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ_U16 - - class Inst_DS__DS_SWIZZLE_B32 : public Inst_DS - { - public: - Inst_DS__DS_SWIZZLE_B32(InFmt_DS*); - ~Inst_DS__DS_SWIZZLE_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) 
{ - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_SWIZZLE_B32 - - class Inst_DS__DS_PERMUTE_B32 : public Inst_DS - { - public: - Inst_DS__DS_PERMUTE_B32(InFmt_DS*); - ~Inst_DS__DS_PERMUTE_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_PERMUTE_B32 - - class Inst_DS__DS_BPERMUTE_B32 : public Inst_DS - { - public: - Inst_DS__DS_BPERMUTE_B32(InFmt_DS*); - ~Inst_DS__DS_BPERMUTE_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 4; - case 2: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_BPERMUTE_B32 - - class Inst_DS__DS_ADD_U64 : public Inst_DS - { - public: - Inst_DS__DS_ADD_U64(InFmt_DS*); - ~Inst_DS__DS_ADD_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) 
{ - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ADD_U64 - - class Inst_DS__DS_SUB_U64 : public Inst_DS - { - public: - Inst_DS__DS_SUB_U64(InFmt_DS*); - ~Inst_DS__DS_SUB_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_SUB_U64 - - class Inst_DS__DS_RSUB_U64 : public Inst_DS - { - public: - Inst_DS__DS_RSUB_U64(InFmt_DS*); - ~Inst_DS__DS_RSUB_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_RSUB_U64 - - class Inst_DS__DS_INC_U64 : public Inst_DS - { - public: - Inst_DS__DS_INC_U64(InFmt_DS*); - ~Inst_DS__DS_INC_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of 
bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_INC_U64 - - class Inst_DS__DS_DEC_U64 : public Inst_DS - { - public: - Inst_DS__DS_DEC_U64(InFmt_DS*); - ~Inst_DS__DS_DEC_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_DEC_U64 - - class Inst_DS__DS_MIN_I64 : public Inst_DS - { - public: - Inst_DS__DS_MIN_I64(InFmt_DS*); - ~Inst_DS__DS_MIN_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_I64 - - class Inst_DS__DS_MAX_I64 : public Inst_DS - { - public: - Inst_DS__DS_MAX_I64(InFmt_DS*); - ~Inst_DS__DS_MAX_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; 
// Inst_DS__DS_MAX_I64 - - class Inst_DS__DS_MIN_U64 : public Inst_DS - { - public: - Inst_DS__DS_MIN_U64(InFmt_DS*); - ~Inst_DS__DS_MIN_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_U64 - - class Inst_DS__DS_MAX_U64 : public Inst_DS - { - public: - Inst_DS__DS_MAX_U64(InFmt_DS*); - ~Inst_DS__DS_MAX_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_U64 - - class Inst_DS__DS_AND_B64 : public Inst_DS - { - public: - Inst_DS__DS_AND_B64(InFmt_DS*); - ~Inst_DS__DS_AND_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_AND_B64 - - class Inst_DS__DS_OR_B64 : public Inst_DS - { - public: - 
Inst_DS__DS_OR_B64(InFmt_DS*); - ~Inst_DS__DS_OR_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_OR_B64 - - class Inst_DS__DS_XOR_B64 : public Inst_DS - { - public: - Inst_DS__DS_XOR_B64(InFmt_DS*); - ~Inst_DS__DS_XOR_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_XOR_B64 - - class Inst_DS__DS_MSKOR_B64 : public Inst_DS - { - public: - Inst_DS__DS_MSKOR_B64(InFmt_DS*); - ~Inst_DS__DS_MSKOR_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MSKOR_B64 - - class Inst_DS__DS_WRITE_B64 : public Inst_DS - { - public: - Inst_DS__DS_WRITE_B64(InFmt_DS*); - ~Inst_DS__DS_WRITE_B64(); - - int - getNumOperands() override - 
{ - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE_B64 - - class Inst_DS__DS_WRITE2_B64 : public Inst_DS - { - public: - Inst_DS__DS_WRITE2_B64(InFmt_DS*); - ~Inst_DS__DS_WRITE2_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_d1 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE2_B64 - - class Inst_DS__DS_WRITE2ST64_B64 : public Inst_DS - { - public: - Inst_DS__DS_WRITE2ST64_B64(InFmt_DS*); - ~Inst_DS__DS_WRITE2ST64_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE2ST64_B64 - - class 
Inst_DS__DS_CMPST_B64 : public Inst_DS - { - public: - Inst_DS__DS_CMPST_B64(InFmt_DS*); - ~Inst_DS__DS_CMPST_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CMPST_B64 - - class Inst_DS__DS_CMPST_F64 : public Inst_DS - { - public: - Inst_DS__DS_CMPST_F64(InFmt_DS*); - ~Inst_DS__DS_CMPST_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CMPST_F64 - - class Inst_DS__DS_MIN_F64 : public Inst_DS - { - public: - Inst_DS__DS_MIN_F64(InFmt_DS*); - ~Inst_DS__DS_MIN_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_F64 - - class Inst_DS__DS_MAX_F64 : public Inst_DS - { - public: - Inst_DS__DS_MAX_F64(InFmt_DS*); - 
~Inst_DS__DS_MAX_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_F64 - - class Inst_DS__DS_ADD_RTN_U64 : public Inst_DS - { - public: - Inst_DS__DS_ADD_RTN_U64(InFmt_DS*); - ~Inst_DS__DS_ADD_RTN_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ADD_RTN_U64 - - class Inst_DS__DS_SUB_RTN_U64 : public Inst_DS - { - public: - Inst_DS__DS_SUB_RTN_U64(InFmt_DS*); - ~Inst_DS__DS_SUB_RTN_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_SUB_RTN_U64 - - class Inst_DS__DS_RSUB_RTN_U64 : public Inst_DS - { - public: - Inst_DS__DS_RSUB_RTN_U64(InFmt_DS*); 
- ~Inst_DS__DS_RSUB_RTN_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_RSUB_RTN_U64 - - class Inst_DS__DS_INC_RTN_U64 : public Inst_DS - { - public: - Inst_DS__DS_INC_RTN_U64(InFmt_DS*); - ~Inst_DS__DS_INC_RTN_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_INC_RTN_U64 - - class Inst_DS__DS_DEC_RTN_U64 : public Inst_DS - { - public: - Inst_DS__DS_DEC_RTN_U64(InFmt_DS*); - ~Inst_DS__DS_DEC_RTN_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_DEC_RTN_U64 - - class Inst_DS__DS_MIN_RTN_I64 : public Inst_DS - { - 
public: - Inst_DS__DS_MIN_RTN_I64(InFmt_DS*); - ~Inst_DS__DS_MIN_RTN_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_RTN_I64 - - class Inst_DS__DS_MAX_RTN_I64 : public Inst_DS - { - public: - Inst_DS__DS_MAX_RTN_I64(InFmt_DS*); - ~Inst_DS__DS_MAX_RTN_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_RTN_I64 - - class Inst_DS__DS_MIN_RTN_U64 : public Inst_DS - { - public: - Inst_DS__DS_MIN_RTN_U64(InFmt_DS*); - ~Inst_DS__DS_MIN_RTN_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_RTN_U64 - - class 
Inst_DS__DS_MAX_RTN_U64 : public Inst_DS - { - public: - Inst_DS__DS_MAX_RTN_U64(InFmt_DS*); - ~Inst_DS__DS_MAX_RTN_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_RTN_U64 - - class Inst_DS__DS_AND_RTN_B64 : public Inst_DS - { - public: - Inst_DS__DS_AND_RTN_B64(InFmt_DS*); - ~Inst_DS__DS_AND_RTN_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_AND_RTN_B64 - - class Inst_DS__DS_OR_RTN_B64 : public Inst_DS - { - public: - Inst_DS__DS_OR_RTN_B64(InFmt_DS*); - ~Inst_DS__DS_OR_RTN_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; 
// Inst_DS__DS_OR_RTN_B64 - - class Inst_DS__DS_XOR_RTN_B64 : public Inst_DS - { - public: - Inst_DS__DS_XOR_RTN_B64(InFmt_DS*); - ~Inst_DS__DS_XOR_RTN_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_XOR_RTN_B64 - - class Inst_DS__DS_MSKOR_RTN_B64 : public Inst_DS - { - public: - Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS*); - ~Inst_DS__DS_MSKOR_RTN_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MSKOR_RTN_B64 - - class Inst_DS__DS_WRXCHG_RTN_B64 : public Inst_DS - { - public: - Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS*); - ~Inst_DS__DS_WRXCHG_RTN_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // 
getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRXCHG_RTN_B64 - - class Inst_DS__DS_WRXCHG2_RTN_B64 : public Inst_DS - { - public: - Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS*); - ~Inst_DS__DS_WRXCHG2_RTN_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRXCHG2_RTN_B64 - - class Inst_DS__DS_WRXCHG2ST64_RTN_B64 : public Inst_DS - { - public: - Inst_DS__DS_WRXCHG2ST64_RTN_B64(InFmt_DS*); - ~Inst_DS__DS_WRXCHG2ST64_RTN_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRXCHG2ST64_RTN_B64 - - class Inst_DS__DS_CMPST_RTN_B64 : public Inst_DS - { - public: - Inst_DS__DS_CMPST_RTN_B64(InFmt_DS*); - ~Inst_DS__DS_CMPST_RTN_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 8; - case 2: 
//vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CMPST_RTN_B64 - - class Inst_DS__DS_CMPST_RTN_F64 : public Inst_DS - { - public: - Inst_DS__DS_CMPST_RTN_F64(InFmt_DS*); - ~Inst_DS__DS_CMPST_RTN_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d1 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CMPST_RTN_F64 - - class Inst_DS__DS_MIN_RTN_F64 : public Inst_DS - { - public: - Inst_DS__DS_MIN_RTN_F64(InFmt_DS*); - ~Inst_DS__DS_MIN_RTN_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_RTN_F64 - - class Inst_DS__DS_MAX_RTN_F64 : public Inst_DS - { - public: - Inst_DS__DS_MAX_RTN_F64(InFmt_DS*); - ~Inst_DS__DS_MAX_RTN_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - 
return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_RTN_F64 - - class Inst_DS__DS_READ_B64 : public Inst_DS - { - public: - Inst_DS__DS_READ_B64(InFmt_DS*); - ~Inst_DS__DS_READ_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ_B64 - - class Inst_DS__DS_READ2_B64 : public Inst_DS - { - public: - Inst_DS__DS_READ2_B64(InFmt_DS*); - ~Inst_DS__DS_READ2_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ2_B64 - - class Inst_DS__DS_READ2ST64_B64 : public Inst_DS - { - public: - Inst_DS__DS_READ2ST64_B64(InFmt_DS*); - ~Inst_DS__DS_READ2ST64_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; 
} - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ2ST64_B64 - - class Inst_DS__DS_CONDXCHG32_RTN_B64 : public Inst_DS - { - public: - Inst_DS__DS_CONDXCHG32_RTN_B64(InFmt_DS*); - ~Inst_DS__DS_CONDXCHG32_RTN_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 8; - case 2: //vgpr_rtn - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CONDXCHG32_RTN_B64 - - class Inst_DS__DS_ADD_SRC2_U32 : public Inst_DS - { - public: - Inst_DS__DS_ADD_SRC2_U32(InFmt_DS*); - ~Inst_DS__DS_ADD_SRC2_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ADD_SRC2_U32 - - class Inst_DS__DS_SUB_SRC2_U32 : public Inst_DS - { - public: - Inst_DS__DS_SUB_SRC2_U32(InFmt_DS*); - ~Inst_DS__DS_SUB_SRC2_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - 
} // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_SUB_SRC2_U32 - - class Inst_DS__DS_RSUB_SRC2_U32 : public Inst_DS - { - public: - Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS*); - ~Inst_DS__DS_RSUB_SRC2_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_RSUB_SRC2_U32 - - class Inst_DS__DS_INC_SRC2_U32 : public Inst_DS - { - public: - Inst_DS__DS_INC_SRC2_U32(InFmt_DS*); - ~Inst_DS__DS_INC_SRC2_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_INC_SRC2_U32 - - class Inst_DS__DS_DEC_SRC2_U32 : public Inst_DS - { - public: - Inst_DS__DS_DEC_SRC2_U32(InFmt_DS*); - ~Inst_DS__DS_DEC_SRC2_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - 
getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_DEC_SRC2_U32 - - class Inst_DS__DS_MIN_SRC2_I32 : public Inst_DS - { - public: - Inst_DS__DS_MIN_SRC2_I32(InFmt_DS*); - ~Inst_DS__DS_MIN_SRC2_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_SRC2_I32 - - class Inst_DS__DS_MAX_SRC2_I32 : public Inst_DS - { - public: - Inst_DS__DS_MAX_SRC2_I32(InFmt_DS*); - ~Inst_DS__DS_MAX_SRC2_I32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_SRC2_I32 - - class Inst_DS__DS_MIN_SRC2_U32 : public Inst_DS - { - public: - Inst_DS__DS_MIN_SRC2_U32(InFmt_DS*); - ~Inst_DS__DS_MIN_SRC2_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", 
opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_SRC2_U32 - - class Inst_DS__DS_MAX_SRC2_U32 : public Inst_DS - { - public: - Inst_DS__DS_MAX_SRC2_U32(InFmt_DS*); - ~Inst_DS__DS_MAX_SRC2_U32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_SRC2_U32 - - class Inst_DS__DS_AND_SRC2_B32 : public Inst_DS - { - public: - Inst_DS__DS_AND_SRC2_B32(InFmt_DS*); - ~Inst_DS__DS_AND_SRC2_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_AND_SRC2_B32 - - class Inst_DS__DS_OR_SRC2_B32 : public Inst_DS - { - public: - Inst_DS__DS_OR_SRC2_B32(InFmt_DS*); - ~Inst_DS__DS_OR_SRC2_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_OR_SRC2_B32 - - class 
Inst_DS__DS_XOR_SRC2_B32 : public Inst_DS - { - public: - Inst_DS__DS_XOR_SRC2_B32(InFmt_DS*); - ~Inst_DS__DS_XOR_SRC2_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_XOR_SRC2_B32 - - class Inst_DS__DS_WRITE_SRC2_B32 : public Inst_DS - { - public: - Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS*); - ~Inst_DS__DS_WRITE_SRC2_B32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE_SRC2_B32 - - class Inst_DS__DS_MIN_SRC2_F32 : public Inst_DS - { - public: - Inst_DS__DS_MIN_SRC2_F32(InFmt_DS*); - ~Inst_DS__DS_MIN_SRC2_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_SRC2_F32 - - class Inst_DS__DS_MAX_SRC2_F32 : public Inst_DS - { - public: - Inst_DS__DS_MAX_SRC2_F32(InFmt_DS*); - ~Inst_DS__DS_MAX_SRC2_F32(); - - int 
- getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_SRC2_F32 - - class Inst_DS__DS_ADD_SRC2_F32 : public Inst_DS - { - public: - Inst_DS__DS_ADD_SRC2_F32(InFmt_DS*); - ~Inst_DS__DS_ADD_SRC2_F32(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ADD_SRC2_F32 - - class Inst_DS__DS_GWS_SEMA_RELEASE_ALL : public Inst_DS - { - public: - Inst_DS__DS_GWS_SEMA_RELEASE_ALL(InFmt_DS*); - ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_GWS_SEMA_RELEASE_ALL - - class Inst_DS__DS_GWS_INIT : public Inst_DS - { - public: - Inst_DS__DS_GWS_INIT(InFmt_DS*); - ~Inst_DS__DS_GWS_INIT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d0 - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_GWS_INIT - - class Inst_DS__DS_GWS_SEMA_V : public Inst_DS - { - public: - Inst_DS__DS_GWS_SEMA_V(InFmt_DS*); - ~Inst_DS__DS_GWS_SEMA_V(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_GWS_SEMA_V - - class Inst_DS__DS_GWS_SEMA_BR : public Inst_DS - { - public: - Inst_DS__DS_GWS_SEMA_BR(InFmt_DS*); - ~Inst_DS__DS_GWS_SEMA_BR(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d0 - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_GWS_SEMA_BR - - class Inst_DS__DS_GWS_SEMA_P : public Inst_DS - { - public: - Inst_DS__DS_GWS_SEMA_P(InFmt_DS*); - ~Inst_DS__DS_GWS_SEMA_P(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - 
int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_GWS_SEMA_P - - class Inst_DS__DS_GWS_BARRIER : public Inst_DS - { - public: - Inst_DS__DS_GWS_BARRIER(InFmt_DS*); - ~Inst_DS__DS_GWS_BARRIER(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d0 - return 4; - case 1: //vgpr_d0 - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_GWS_BARRIER - - class Inst_DS__DS_CONSUME : public Inst_DS - { - public: - Inst_DS__DS_CONSUME(InFmt_DS*); - ~Inst_DS__DS_CONSUME(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_CONSUME - - class Inst_DS__DS_APPEND : public Inst_DS - { - public: - Inst_DS__DS_APPEND(InFmt_DS*); - ~Inst_DS__DS_APPEND(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - 
return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_APPEND - - class Inst_DS__DS_ORDERED_COUNT : public Inst_DS - { - public: - Inst_DS__DS_ORDERED_COUNT(InFmt_DS*); - ~Inst_DS__DS_ORDERED_COUNT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ORDERED_COUNT - - class Inst_DS__DS_ADD_SRC2_U64 : public Inst_DS - { - public: - Inst_DS__DS_ADD_SRC2_U64(InFmt_DS*); - ~Inst_DS__DS_ADD_SRC2_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_ADD_SRC2_U64 - - class Inst_DS__DS_SUB_SRC2_U64 : public Inst_DS - { - public: - Inst_DS__DS_SUB_SRC2_U64(InFmt_DS*); - ~Inst_DS__DS_SUB_SRC2_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - 
void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_SUB_SRC2_U64 - - class Inst_DS__DS_RSUB_SRC2_U64 : public Inst_DS - { - public: - Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS*); - ~Inst_DS__DS_RSUB_SRC2_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_RSUB_SRC2_U64 - - class Inst_DS__DS_INC_SRC2_U64 : public Inst_DS - { - public: - Inst_DS__DS_INC_SRC2_U64(InFmt_DS*); - ~Inst_DS__DS_INC_SRC2_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_INC_SRC2_U64 - - class Inst_DS__DS_DEC_SRC2_U64 : public Inst_DS - { - public: - Inst_DS__DS_DEC_SRC2_U64(InFmt_DS*); - ~Inst_DS__DS_DEC_SRC2_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_DEC_SRC2_U64 - - class Inst_DS__DS_MIN_SRC2_I64 : public Inst_DS - { - public: - 
Inst_DS__DS_MIN_SRC2_I64(InFmt_DS*); - ~Inst_DS__DS_MIN_SRC2_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_SRC2_I64 - - class Inst_DS__DS_MAX_SRC2_I64 : public Inst_DS - { - public: - Inst_DS__DS_MAX_SRC2_I64(InFmt_DS*); - ~Inst_DS__DS_MAX_SRC2_I64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_SRC2_I64 - - class Inst_DS__DS_MIN_SRC2_U64 : public Inst_DS - { - public: - Inst_DS__DS_MIN_SRC2_U64(InFmt_DS*); - ~Inst_DS__DS_MIN_SRC2_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_SRC2_U64 - - class Inst_DS__DS_MAX_SRC2_U64 : public Inst_DS - { - public: - Inst_DS__DS_MAX_SRC2_U64(InFmt_DS*); - ~Inst_DS__DS_MAX_SRC2_U64(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_SRC2_U64 - - class Inst_DS__DS_AND_SRC2_B64 : public Inst_DS - { - public: - Inst_DS__DS_AND_SRC2_B64(InFmt_DS*); - ~Inst_DS__DS_AND_SRC2_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_AND_SRC2_B64 - - class Inst_DS__DS_OR_SRC2_B64 : public Inst_DS - { - public: - Inst_DS__DS_OR_SRC2_B64(InFmt_DS*); - ~Inst_DS__DS_OR_SRC2_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_OR_SRC2_B64 - - class Inst_DS__DS_XOR_SRC2_B64 : public Inst_DS - { - public: - Inst_DS__DS_XOR_SRC2_B64(InFmt_DS*); - ~Inst_DS__DS_XOR_SRC2_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - 
int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_XOR_SRC2_B64 - - class Inst_DS__DS_WRITE_SRC2_B64 : public Inst_DS - { - public: - Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS*); - ~Inst_DS__DS_WRITE_SRC2_B64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE_SRC2_B64 - - class Inst_DS__DS_MIN_SRC2_F64 : public Inst_DS - { - public: - Inst_DS__DS_MIN_SRC2_F64(InFmt_DS*); - ~Inst_DS__DS_MIN_SRC2_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MIN_SRC2_F64 - - class Inst_DS__DS_MAX_SRC2_F64 : public Inst_DS - { - public: - Inst_DS__DS_MAX_SRC2_F64(InFmt_DS*); - ~Inst_DS__DS_MAX_SRC2_F64(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - default: - fatal("op idx %i out of 
bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_DS__DS_MAX_SRC2_F64 - - class Inst_DS__DS_WRITE_B96 : public Inst_DS - { - public: - Inst_DS__DS_WRITE_B96(InFmt_DS*); - ~Inst_DS__DS_WRITE_B96(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 12; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE_B96 - - class Inst_DS__DS_WRITE_B128 : public Inst_DS - { - public: - Inst_DS__DS_WRITE_B128(InFmt_DS*); - ~Inst_DS__DS_WRITE_B128(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_d0 - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_WRITE_B128 - - class Inst_DS__DS_READ_B96 : public Inst_DS - { - public: - Inst_DS__DS_READ_B96(InFmt_DS*); - ~Inst_DS__DS_READ_B96(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { 
- case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 12; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ_B96 - - class Inst_DS__DS_READ_B128 : public Inst_DS - { - public: - Inst_DS__DS_READ_B128(InFmt_DS*); - ~Inst_DS__DS_READ_B128(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //vgpr_rtn - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_DS__DS_READ_B128 - - class Inst_MUBUF__BUFFER_LOAD_FORMAT_X : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_FORMAT_X(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_FORMAT_X - - class Inst_MUBUF__BUFFER_LOAD_FORMAT_XY : public Inst_MUBUF - { - public: - 
Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY - - class Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 12; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ - - class Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - 
- int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW - - class Inst_MUBUF__BUFFER_STORE_FORMAT_X : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_FORMAT_X(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 4; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_FORMAT_X - - class Inst_MUBUF__BUFFER_STORE_FORMAT_XY : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_FORMAT_XY(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 8; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_FORMAT_XY - - class Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 12; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ - - class Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 16; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW - - class Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X : public Inst_MUBUF - { - public: - 
Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X - - class Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY - - class Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int 
numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 12; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ - - class Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - class Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 4; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of 
bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - - class Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 8; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY - - class Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 12; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ - - class 
Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 16; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW - - class Inst_MUBUF__BUFFER_LOAD_UBYTE : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_UBYTE(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 4; - case 1: //vgpr_a - if (instData.OFFEN && instData.IDXEN) { - // if we need an idx and offset from a - // VGPR, we'll read VGPR[VADDR] and - // VGPR[VADDR + 1], otherwise we just - // read VGPR[VADDR] - return 8; - } else { - return 4; - } - case 2: //sgpr_r - return 16; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_UBYTE - - class Inst_MUBUF__BUFFER_LOAD_SBYTE : public Inst_MUBUF - { - public: - 
Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_SBYTE(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_SBYTE - - class Inst_MUBUF__BUFFER_LOAD_USHORT : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_USHORT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_USHORT - - class Inst_MUBUF__BUFFER_LOAD_SSHORT : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_SSHORT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) 
override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_SSHORT - - class Inst_MUBUF__BUFFER_LOAD_DWORD : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_DWORD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_DWORD - - class Inst_MUBUF__BUFFER_LOAD_DWORDX2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_DWORDX2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; 
- void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_DWORDX2 - - class Inst_MUBUF__BUFFER_LOAD_DWORDX3 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_DWORDX3(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 12; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_DWORDX3 - - class Inst_MUBUF__BUFFER_LOAD_DWORDX4 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_LOAD_DWORDX4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_LOAD_DWORDX4 - - class Inst_MUBUF__BUFFER_STORE_BYTE : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_BYTE(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 4; - case 1: //vgpr_a - if (instData.OFFEN && instData.IDXEN) { - // if we need an idx and offset from a - // VGPR, we'll read VGPR[VADDR] and - // VGPR[VADDR + 1], otherwise we just - // read VGPR[VADDR] - return 8; - } else { - return 4; - } - case 2: //sgpr_r - return 16; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_BYTE - - class Inst_MUBUF__BUFFER_STORE_SHORT : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_SHORT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 4; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 16; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_SHORT - - class Inst_MUBUF__BUFFER_STORE_DWORD : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_DWORD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override 
{ return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 4; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 16; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_DWORD - - class Inst_MUBUF__BUFFER_STORE_DWORDX2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_DWORDX2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 8; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 16; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_DWORDX2 - - class Inst_MUBUF__BUFFER_STORE_DWORDX3 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_DWORDX3(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 12; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 16; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_DWORDX3 - - class Inst_MUBUF__BUFFER_STORE_DWORDX4 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_DWORDX4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_s - return 16; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 16; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_DWORDX4 - - class Inst_MUBUF__BUFFER_STORE_LDS_DWORD : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //sgpr_r - return 16; - case 1: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_STORE_LDS_DWORD - - class Inst_MUBUF__BUFFER_WBINVL1 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_WBINVL1(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { 
return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_WBINVL1 - - class Inst_MUBUF__BUFFER_WBINVL1_VOL : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_WBINVL1_VOL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 0; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_WBINVL1_VOL - - class Inst_MUBUF__BUFFER_ATOMIC_SWAP : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_SWAP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_SWAP - - class Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF*); - 
~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - - class Inst_MUBUF__BUFFER_ATOMIC_ADD : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_ADD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_ADD - - class Inst_MUBUF__BUFFER_ATOMIC_SUB : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_SUB(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } 
- } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_SUB - - class Inst_MUBUF__BUFFER_ATOMIC_SMIN : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_SMIN(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_SMIN - - class Inst_MUBUF__BUFFER_ATOMIC_UMIN : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_UMIN(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_UMIN - - class Inst_MUBUF__BUFFER_ATOMIC_SMAX : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_SMAX(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) 
override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_SMAX - - class Inst_MUBUF__BUFFER_ATOMIC_UMAX : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_UMAX(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_UMAX - - class Inst_MUBUF__BUFFER_ATOMIC_AND : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_AND(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_AND - - class Inst_MUBUF__BUFFER_ATOMIC_OR : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_OR(); - - int - getNumOperands() 
override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_OR - - class Inst_MUBUF__BUFFER_ATOMIC_XOR : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_XOR(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_XOR - - class Inst_MUBUF__BUFFER_ATOMIC_INC : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_INC(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - 
}; // Inst_MUBUF__BUFFER_ATOMIC_INC - - class Inst_MUBUF__BUFFER_ATOMIC_DEC : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_DEC(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_DEC - - class Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 
0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(); - - int - getNumOperands() 
override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_AND_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_OR_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) 
override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_INC_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF*); - ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - - class Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 : public Inst_MUBUF - { - public: - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF*); - 
~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 16; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - - class Inst_MTBUF__TBUFFER_LOAD_FORMAT_X : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - - class Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o 
- return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - - class Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 12; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - class Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // 
Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - class Inst_MTBUF__TBUFFER_STORE_FORMAT_X : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 32; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_STORE_FORMAT_X - - class Inst_MTBUF__TBUFFER_STORE_FORMAT_XY : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 8; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - - class Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 12; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - class Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 16; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - class Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: 
//sgpr_o - return 4; - case 3: //vgpr_d - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - class Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - class Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 12; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) 
override; - }; // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - class Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 8; - case 1: //sgpr_r - return 4; - case 2: //sgpr_o - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - class Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 32; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - class Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 8; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - class Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 12; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ - - class Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW : public Inst_MTBUF - { - public: - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF*); - ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - 
switch (opIdx) { - case 0: //vgpr_d - return 16; - case 1: //vgpr_a - return 8; - case 2: //sgpr_r - return 4; - case 3: //sgpr_o - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW - - class Inst_MIMG__IMAGE_LOAD : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_LOAD(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_LOAD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 16; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_LOAD - - class Inst_MIMG__IMAGE_LOAD_MIP : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_LOAD_MIP(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_LOAD_MIP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 16; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_LOAD_MIP - - class Inst_MIMG__IMAGE_LOAD_PCK : 
public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_LOAD_PCK(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_LOAD_PCK(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 16; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_LOAD_PCK - - class Inst_MIMG__IMAGE_LOAD_PCK_SGN : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_LOAD_PCK_SGN(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_LOAD_PCK_SGN(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 16; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_LOAD_PCK_SGN - - class Inst_MIMG__IMAGE_LOAD_MIP_PCK : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_LOAD_MIP_PCK(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_LOAD_MIP_PCK(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - 
return 16; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_LOAD_MIP_PCK - - class Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 16; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN - - class Inst_MIMG__IMAGE_STORE : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_STORE(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_STORE(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 16; - case 1: //vgpr_a - return 16; - case 2: //sgpr_r - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_STORE - - class Inst_MIMG__IMAGE_STORE_MIP : public Inst_MIMG - { - public: - 
Inst_MIMG__IMAGE_STORE_MIP(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_STORE_MIP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 16; - case 1: //vgpr_a - return 16; - case 2: //sgpr_r - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_STORE_MIP - - class Inst_MIMG__IMAGE_STORE_PCK : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_STORE_PCK(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_STORE_PCK(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 16; - case 1: //vgpr_a - return 16; - case 2: //sgpr_r - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_STORE_PCK - - class Inst_MIMG__IMAGE_STORE_MIP_PCK : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_STORE_MIP_PCK(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_STORE_MIP_PCK(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_d - return 16; - case 1: //vgpr_a - return 16; 
- case 2: //sgpr_r - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_STORE_MIP_PCK - - class Inst_MIMG__IMAGE_GET_RESINFO : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GET_RESINFO(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GET_RESINFO(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 16; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GET_RESINFO - - class Inst_MIMG__IMAGE_ATOMIC_SWAP : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_SWAP(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_SWAP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_SWAP - - class Inst_MIMG__IMAGE_ATOMIC_CMPSWAP : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_CMPSWAP(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP - - class Inst_MIMG__IMAGE_ATOMIC_ADD : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_ADD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_ADD - - class Inst_MIMG__IMAGE_ATOMIC_SUB : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_SUB(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_SUB - - class Inst_MIMG__IMAGE_ATOMIC_SMIN : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_SMIN(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_SMIN(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_SMIN - - class Inst_MIMG__IMAGE_ATOMIC_UMIN : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_UMIN(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_UMIN(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_UMIN - - class Inst_MIMG__IMAGE_ATOMIC_SMAX : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_SMAX(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_SMAX(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_SMAX - - class Inst_MIMG__IMAGE_ATOMIC_UMAX : public 
Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_UMAX(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_UMAX(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_UMAX - - class Inst_MIMG__IMAGE_ATOMIC_AND : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_AND(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_AND - - class Inst_MIMG__IMAGE_ATOMIC_OR : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_OR(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void 
execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_OR - - class Inst_MIMG__IMAGE_ATOMIC_XOR : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_XOR(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_XOR - - class Inst_MIMG__IMAGE_ATOMIC_INC : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_INC(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_INC - - class Inst_MIMG__IMAGE_ATOMIC_DEC : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_ATOMIC_DEC(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 32; - case 1: //sgpr_r - return 32; - case 2: //vgpr_d - return 16; - 
default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_ATOMIC_DEC - - class Inst_MIMG__IMAGE_SAMPLE : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE - - class Inst_MIMG__IMAGE_SAMPLE_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_CL - - class Inst_MIMG__IMAGE_SAMPLE_D : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_D(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_D(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int 
opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_D - - class Inst_MIMG__IMAGE_SAMPLE_D_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_D_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_D_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_D_CL - - class Inst_MIMG__IMAGE_SAMPLE_L : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_L(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_L(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_L - - class Inst_MIMG__IMAGE_SAMPLE_B : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_B(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_B(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_B - - class Inst_MIMG__IMAGE_SAMPLE_B_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_B_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_B_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_B_CL - - class Inst_MIMG__IMAGE_SAMPLE_LZ : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_LZ(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_LZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_LZ - - class 
Inst_MIMG__IMAGE_SAMPLE_C : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C - - class Inst_MIMG__IMAGE_SAMPLE_C_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_CL - - class Inst_MIMG__IMAGE_SAMPLE_C_D : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_D(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 
16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_D - - class Inst_MIMG__IMAGE_SAMPLE_C_D_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_D_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_D_CL - - class Inst_MIMG__IMAGE_SAMPLE_C_L : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_L(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_L - - class Inst_MIMG__IMAGE_SAMPLE_C_B : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_B(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { 
return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_B - - class Inst_MIMG__IMAGE_SAMPLE_C_B_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_B_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_B_CL - - class Inst_MIMG__IMAGE_SAMPLE_C_LZ : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_LZ(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_LZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_LZ - - class Inst_MIMG__IMAGE_SAMPLE_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_O(); - 
- int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_O - - class Inst_MIMG__IMAGE_SAMPLE_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_CL_O - - class Inst_MIMG__IMAGE_SAMPLE_D_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_D_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) 
override; - }; // Inst_MIMG__IMAGE_SAMPLE_D_O - - class Inst_MIMG__IMAGE_SAMPLE_D_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_D_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_D_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_D_CL_O - - class Inst_MIMG__IMAGE_SAMPLE_L_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_L_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_L_O - - class Inst_MIMG__IMAGE_SAMPLE_B_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_B_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r 
- return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_B_O - - class Inst_MIMG__IMAGE_SAMPLE_B_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_B_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_B_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_B_CL_O - - class Inst_MIMG__IMAGE_SAMPLE_LZ_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_LZ_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_LZ_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_LZ_O - - class Inst_MIMG__IMAGE_SAMPLE_C_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int 
numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_O - - class Inst_MIMG__IMAGE_SAMPLE_C_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_CL_O - - class Inst_MIMG__IMAGE_SAMPLE_C_D_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_D_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_D_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_D_O - - class Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O : public 
Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O - - class Inst_MIMG__IMAGE_SAMPLE_C_L_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_L_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_L_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_L_O - - class Inst_MIMG__IMAGE_SAMPLE_C_B_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_B_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_B_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; 
- default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_B_O - - class Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O - - class Inst_MIMG__IMAGE_SAMPLE_C_LZ_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_LZ_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O - - class Inst_MIMG__IMAGE_GATHER4 : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() 
override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4 - - class Inst_MIMG__IMAGE_GATHER4_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_CL - - class Inst_MIMG__IMAGE_GATHER4_L : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_L(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_L - - class Inst_MIMG__IMAGE_GATHER4_B : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_B(); - - int - 
getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_B - - class Inst_MIMG__IMAGE_GATHER4_B_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_B_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_B_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_B_CL - - class Inst_MIMG__IMAGE_GATHER4_LZ : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_LZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) 
override; - }; // Inst_MIMG__IMAGE_GATHER4_LZ - - class Inst_MIMG__IMAGE_GATHER4_C : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C - - class Inst_MIMG__IMAGE_GATHER4_C_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_CL - - class Inst_MIMG__IMAGE_GATHER4_C_L : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_L(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_L(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - 
return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_L - - class Inst_MIMG__IMAGE_GATHER4_C_B : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_B(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_B(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_B - - class Inst_MIMG__IMAGE_GATHER4_C_B_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_B_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_B_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_B_CL - - class Inst_MIMG__IMAGE_GATHER4_C_LZ : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_LZ(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_LZ(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - 
- int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_LZ - - class Inst_MIMG__IMAGE_GATHER4_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_O - - class Inst_MIMG__IMAGE_GATHER4_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_CL_O - - class Inst_MIMG__IMAGE_GATHER4_L_O : public Inst_MIMG - { - 
public: - Inst_MIMG__IMAGE_GATHER4_L_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_L_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_L_O - - class Inst_MIMG__IMAGE_GATHER4_B_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_B_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_B_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_B_O - - class Inst_MIMG__IMAGE_GATHER4_B_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_B_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_B_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx 
%i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_B_CL_O - - class Inst_MIMG__IMAGE_GATHER4_LZ_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_LZ_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_LZ_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_LZ_O - - class Inst_MIMG__IMAGE_GATHER4_C_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_O - - class Inst_MIMG__IMAGE_GATHER4_C_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - 
- int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_CL_O - - class Inst_MIMG__IMAGE_GATHER4_C_L_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_L_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_L_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_L_O - - class Inst_MIMG__IMAGE_GATHER4_C_B_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_B_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_B_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_B_O - - class Inst_MIMG__IMAGE_GATHER4_C_B_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_B_CL_O(InFmt_MIMG*); - 
~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O - - class Inst_MIMG__IMAGE_GATHER4_C_LZ_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GATHER4_C_LZ_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GATHER4_C_LZ_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GATHER4_C_LZ_O - - class Inst_MIMG__IMAGE_GET_LOD : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_GET_LOD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // 
getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_GET_LOD - - class Inst_MIMG__IMAGE_SAMPLE_CD : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_CD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_CD - - class Inst_MIMG__IMAGE_SAMPLE_CD_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_CD_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_CD_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_CD_CL - - class Inst_MIMG__IMAGE_SAMPLE_C_CD : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_CD(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_CD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: 
//vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_CD - - class Inst_MIMG__IMAGE_SAMPLE_C_CD_CL : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL - - class Inst_MIMG__IMAGE_SAMPLE_CD_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_CD_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_CD_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_CD_O - - class Inst_MIMG__IMAGE_SAMPLE_CD_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_CD_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() 
+ numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O - - class Inst_MIMG__IMAGE_SAMPLE_C_CD_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_CD_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_CD_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_MIMG__IMAGE_SAMPLE_C_CD_O - - class Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O : public Inst_MIMG - { - public: - Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O(InFmt_MIMG*); - ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 3; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_a - return 4; - case 1: //sgpr_r - return 32; - case 2: //sgpr_s - return 4; - case 3: //vgpr_d - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // 
Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O - - class Inst_EXP__EXP : public Inst_EXP - { - public: - Inst_EXP__EXP(InFmt_EXP*); - ~Inst_EXP__EXP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 4; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: // - return 32; - case 1: // - return 32; - case 2: // - return 32; - case 3: // - return 32; - case 4: // - return 32; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_EXP__EXP - - class Inst_FLAT__FLAT_LOAD_UBYTE : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT*); - ~Inst_FLAT__FLAT_LOAD_UBYTE(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_dst - return 1; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_LOAD_UBYTE - - class Inst_FLAT__FLAT_LOAD_SBYTE : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT*); - ~Inst_FLAT__FLAT_LOAD_SBYTE(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_dst - return 1; - default: - fatal("op idx %i out of 
bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_LOAD_SBYTE - - class Inst_FLAT__FLAT_LOAD_USHORT : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT*); - ~Inst_FLAT__FLAT_LOAD_USHORT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_dst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_LOAD_USHORT - - class Inst_FLAT__FLAT_LOAD_SSHORT : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT*); - ~Inst_FLAT__FLAT_LOAD_SSHORT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_dst - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_LOAD_SSHORT - - class Inst_FLAT__FLAT_LOAD_DWORD : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT*); - ~Inst_FLAT__FLAT_LOAD_DWORD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // 
getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_LOAD_DWORD - - class Inst_FLAT__FLAT_LOAD_DWORDX2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_LOAD_DWORDX2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_LOAD_DWORDX2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_LOAD_DWORDX2 - - class Inst_FLAT__FLAT_LOAD_DWORDX3 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_LOAD_DWORDX3(InFmt_FLAT*); - ~Inst_FLAT__FLAT_LOAD_DWORDX3(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_dst - return 12; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; 
- }; // Inst_FLAT__FLAT_LOAD_DWORDX3 - - class Inst_FLAT__FLAT_LOAD_DWORDX4 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_LOAD_DWORDX4(InFmt_FLAT*); - ~Inst_FLAT__FLAT_LOAD_DWORDX4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 1; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_dst - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_LOAD_DWORDX4 - - class Inst_FLAT__FLAT_STORE_BYTE : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT*); - ~Inst_FLAT__FLAT_STORE_BYTE(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 1; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_STORE_BYTE - - class Inst_FLAT__FLAT_STORE_SHORT : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT*); - ~Inst_FLAT__FLAT_STORE_SHORT(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: 
//vgpr_addr - return 8; - case 1: //vgpr_src - return 2; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_STORE_SHORT - - class Inst_FLAT__FLAT_STORE_DWORD : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT*); - ~Inst_FLAT__FLAT_STORE_DWORD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_STORE_DWORD - - class Inst_FLAT__FLAT_STORE_DWORDX2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_STORE_DWORDX2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_STORE_DWORDX2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_STORE_DWORDX2 - - class Inst_FLAT__FLAT_STORE_DWORDX3 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_STORE_DWORDX3(InFmt_FLAT*); - ~Inst_FLAT__FLAT_STORE_DWORDX3(); 
- - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 12; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_STORE_DWORDX3 - - class Inst_FLAT__FLAT_STORE_DWORDX4 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_STORE_DWORDX4(InFmt_FLAT*); - ~Inst_FLAT__FLAT_STORE_DWORDX4(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 0; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 16; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_STORE_DWORDX4 - - class Inst_FLAT__FLAT_ATOMIC_SWAP : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_SWAP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // 
getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_SWAP - - class Inst_FLAT__FLAT_ATOMIC_CMPSWAP : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_CMPSWAP - - class Inst_FLAT__FLAT_ATOMIC_ADD : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_ADD(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_ADD - - class Inst_FLAT__FLAT_ATOMIC_SUB : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_SUB(); - - int - getNumOperands() override - { - return numDstRegOperands() + 
numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_SUB - - class Inst_FLAT__FLAT_ATOMIC_SMIN : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_SMIN(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_SMIN - - class Inst_FLAT__FLAT_ATOMIC_UMIN : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_UMIN(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_UMIN - - class 
Inst_FLAT__FLAT_ATOMIC_SMAX : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_SMAX(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_SMAX - - class Inst_FLAT__FLAT_ATOMIC_UMAX : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_UMAX(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_UMAX - - class Inst_FLAT__FLAT_ATOMIC_AND : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_AND(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } 
- } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_AND - - class Inst_FLAT__FLAT_ATOMIC_OR : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_OR(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_OR - - class Inst_FLAT__FLAT_ATOMIC_XOR : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_XOR(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_XOR - - class Inst_FLAT__FLAT_ATOMIC_INC : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_INC(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: 
//vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_INC - - class Inst_FLAT__FLAT_ATOMIC_DEC : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_DEC(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 4; - case 2: //vgpr_dst - return 4; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_DEC - - class Inst_FLAT__FLAT_ATOMIC_SWAP_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_SWAP_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 - - class Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2(); - - int - getNumOperands() override - { - 
return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 16; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - class Inst_FLAT__FLAT_ATOMIC_ADD_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_ADD_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_ADD_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_ADD_X2 - - class Inst_FLAT__FLAT_ATOMIC_SUB_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_SUB_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_SUB_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", 
opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_SUB_X2 - - class Inst_FLAT__FLAT_ATOMIC_SMIN_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_SMIN_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_SMIN_X2 - - class Inst_FLAT__FLAT_ATOMIC_UMIN_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_UMIN_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_UMIN_X2 - - class Inst_FLAT__FLAT_ATOMIC_SMAX_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_SMAX_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() 
override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_SMAX_X2 - - class Inst_FLAT__FLAT_ATOMIC_UMAX_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_UMAX_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_UMAX_X2 - - class Inst_FLAT__FLAT_ATOMIC_AND_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_AND_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_AND_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_AND_X2 - - class Inst_FLAT__FLAT_ATOMIC_OR_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_OR_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_OR_X2(); - - int - getNumOperands() override - { - return 
numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_OR_X2 - - class Inst_FLAT__FLAT_ATOMIC_XOR_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_XOR_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_XOR_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_XOR_X2 - - class Inst_FLAT__FLAT_ATOMIC_INC_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_INC_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_INC_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // 
Inst_FLAT__FLAT_ATOMIC_INC_X2 - - class Inst_FLAT__FLAT_ATOMIC_DEC_X2 : public Inst_FLAT - { - public: - Inst_FLAT__FLAT_ATOMIC_DEC_X2(InFmt_FLAT*); - ~Inst_FLAT__FLAT_ATOMIC_DEC_X2(); - - int - getNumOperands() override - { - return numDstRegOperands() + numSrcRegOperands(); - } // getNumOperands - - int numDstRegOperands() override { return 1; } - int numSrcRegOperands() override { return 2; } - - int - getOperandSize(int opIdx) override - { - switch (opIdx) { - case 0: //vgpr_addr - return 8; - case 1: //vgpr_src - return 8; - case 2: //vgpr_dst - return 8; - default: - fatal("op idx %i out of bounds\n", opIdx); - return -1; - } - } // getOperandSize - - void execute(GPUDynInstPtr) override; - void initiateAcc(GPUDynInstPtr) override; - void completeAcc(GPUDynInstPtr) override; - }; // Inst_FLAT__FLAT_ATOMIC_DEC_X2 -} // namespace Gcn3ISA -} // namespace gem5 - -#endif // __ARCH_GCN3_INSTS_INSTRUCTIONS_HH__ diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.cc b/src/arch/amdgpu/gcn3/insts/op_encodings.cc deleted file mode 100644 index 41b4feefc1..0000000000 --- a/src/arch/amdgpu/gcn3/insts/op_encodings.cc +++ /dev/null @@ -1,1592 +0,0 @@ -/* - * Copyright (c) 2016-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "arch/amdgpu/gcn3/insts/op_encodings.hh" - -#include - -namespace gem5 -{ - -namespace Gcn3ISA -{ - // --- Inst_SOP2 base class methods --- - - Inst_SOP2::Inst_SOP2(InFmt_SOP2 *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - setFlag(Scalar); - - // copy first instruction DWORD - instData = iFmt[0]; - if (hasSecondDword(iFmt)) { - // copy second instruction DWORD into union - extData = ((MachInst)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - varSize = 4 + 4; - } else { - varSize = 4; - } // if - } // Inst_SOP2 - - void - Inst_SOP2::initOperandInfo() - { - int opNum = 0; - - // Needed because can't take addr of bitfield - int reg = instData.SSRC0; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(instData.SSRC0), false, false); - opNum++; - - reg = instData.SSRC1; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(instData.SSRC1), false, false); - opNum++; - - reg = instData.SDST; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - isScalarReg(instData.SDST), false, false); - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - 
Inst_SOP2::instSize() const - { - return varSize; - } // instSize - - bool - Inst_SOP2::hasSecondDword(InFmt_SOP2 *iFmt) - { - if (iFmt->SSRC0 == REG_SRC_LITERAL) - return true; - - if (iFmt->SSRC1 == REG_SRC_LITERAL) - return true; - - return false; - } - - void - Inst_SOP2::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - dis_stream << opSelectorToRegSym(instData.SDST) << ", "; - - if (instData.SSRC0 == REG_SRC_LITERAL) { - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(8) - << _srcLiteral << ", "; - } else { - dis_stream << opSelectorToRegSym(instData.SSRC0) << ", "; - } - - if (instData.SSRC1 == REG_SRC_LITERAL) { - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(8) - << _srcLiteral; - } else { - dis_stream << opSelectorToRegSym(instData.SSRC1); - } - - disassembly = dis_stream.str(); - } - - // --- Inst_SOPK base class methods --- - - Inst_SOPK::Inst_SOPK(InFmt_SOPK *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - setFlag(Scalar); - - // copy first instruction DWORD - instData = iFmt[0]; - if (hasSecondDword(iFmt)) { - // copy second instruction DWORD into union - extData = ((MachInst)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - varSize = 4 + 4; - } else { - varSize = 4; - } // if - } // Inst_SOPK - - Inst_SOPK::~Inst_SOPK() - { - } // ~Inst_SOPK - - void - Inst_SOPK::initOperandInfo() - { - int opNum = 0; - - // Needed because can't take addr of bitfield - int reg = instData.SDST; - if (numSrcRegOperands() == getNumOperands()) { - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), false, false); - opNum++; - } - - reg = instData.SIMM16; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, false, true); - opNum++; - - if (numDstRegOperands()){ - reg = instData.SDST; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - isScalarReg(reg), false, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - 
assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_SOPK::instSize() const - { - return varSize; - } // instSize - - bool - Inst_SOPK::hasSecondDword(InFmt_SOPK *iFmt) - { - /* - SOPK can be a 64-bit instruction, i.e., have a second dword: - S_SETREG_IMM32_B32 writes some or all of the LSBs of a 32-bit - literal constant into a hardware register; - the way to detect such special case is to explicitly check the - opcode (20/0x14) - */ - if (iFmt->OP == 0x14) - return true; - - return false; - } - - - void - Inst_SOPK::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - - // S_SETREG_IMM32_B32 is a 64-bit instruction, using a - // 32-bit literal constant - if (instData.OP == 0x14) { - dis_stream << "0x" << std::hex << std::setfill('0') - << std::setw(8) << extData.imm_u32 << ", "; - } else { - dis_stream << opSelectorToRegSym(instData.SDST) << ", "; - } - - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(4) - << instData.SIMM16; - - disassembly = dis_stream.str(); - } - - // --- Inst_SOP1 base class methods --- - - Inst_SOP1::Inst_SOP1(InFmt_SOP1 *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - setFlag(Scalar); - - // copy first instruction DWORD - instData = iFmt[0]; - if (hasSecondDword(iFmt)) { - // copy second instruction DWORD into union - extData = ((MachInst)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - varSize = 4 + 4; - } else { - varSize = 4; - } // if - } // Inst_SOP1 - - Inst_SOP1::~Inst_SOP1() - { - } // ~Inst_SOP1 - - void - Inst_SOP1::initOperandInfo() - { - int opNum = 0; - - // Needed because can't take addr of bitfield - int reg = instData.SSRC0; - if (instData.OP != 0x1C) { - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(instData.SSRC0), false, false); - opNum++; - } - - reg = instData.SDST; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - isScalarReg(instData.SDST), false, false); - - assert(srcOps.size() == 
numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_SOP1::instSize() const - { - return varSize; - } // instSize - - bool - Inst_SOP1::hasSecondDword(InFmt_SOP1 *iFmt) - { - if (iFmt->SSRC0 == REG_SRC_LITERAL) - return true; - - return false; - } - - void - Inst_SOP1::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - dis_stream << opSelectorToRegSym(instData.SDST) << ", "; - - if (instData.SSRC0 == REG_SRC_LITERAL) { - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(8) - << extData.imm_u32; - } else { - dis_stream << opSelectorToRegSym(instData.SSRC0); - } - - disassembly = dis_stream.str(); - } - - // --- Inst_SOPC base class methods --- - - Inst_SOPC::Inst_SOPC(InFmt_SOPC *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - setFlag(Scalar); - - // copy first instruction DWORD - instData = iFmt[0]; - if (hasSecondDword(iFmt)) { - // copy second instruction DWORD into union - extData = ((MachInst)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - varSize = 4 + 4; - } else { - varSize = 4; - } // if - } // Inst_SOPC - - Inst_SOPC::~Inst_SOPC() - { - } // ~Inst_SOPC - - void - Inst_SOPC::initOperandInfo() - { - int opNum = 0; - - // Needed because can't take addr of bitfield - int reg = instData.SSRC0; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(instData.SSRC0), false, false); - opNum++; - - reg = instData.SSRC1; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(instData.SSRC1), false, false); - - } - - int - Inst_SOPC::instSize() const - { - return varSize; - } // instSize - - bool - Inst_SOPC::hasSecondDword(InFmt_SOPC *iFmt) - { - if (iFmt->SSRC0 == REG_SRC_LITERAL) - return true; - - if (iFmt->SSRC1 == REG_SRC_LITERAL) - return true; - - return false; - } - - void - Inst_SOPC::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - - if (instData.SSRC0 == 
REG_SRC_LITERAL) { - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(8) - << extData.imm_u32; - } else { - dis_stream << opSelectorToRegSym(instData.SSRC0) << ", "; - } - - if (instData.SSRC1 == REG_SRC_LITERAL) { - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(8) - << extData.imm_u32; - } else { - dis_stream << opSelectorToRegSym(instData.SSRC1); - } - - disassembly = dis_stream.str(); - } - - // --- Inst_SOPP base class methods --- - - Inst_SOPP::Inst_SOPP(InFmt_SOPP *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - setFlag(Scalar); - - // copy first instruction DWORD - instData = iFmt[0]; - } // Inst_SOPP - - Inst_SOPP::~Inst_SOPP() - { - } // ~Inst_SOPP - - void - Inst_SOPP::initOperandInfo() - { - int opNum = 0; - - - if (numSrcRegOperands()) { - // Needed because can't take addr of bitfield - int reg = instData.SIMM16; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, false, true); - - opNum++; - - if (readsVCC()) { - srcOps.emplace_back(REG_VCC_LO, getOperandSize(opNum), true, - true, false, false); - opNum++; - } - } - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_SOPP::instSize() const - { - return 4; - } // instSize - - void - Inst_SOPP::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode; - - switch (instData.OP) { - case 8: - { - dis_stream << " "; - int dest = 4 * instData.SIMM16 + 4; - dis_stream << "label_" << std::hex << dest; - } - break; - case 12: - { - dis_stream << " "; - - int vm_cnt = 0; - int exp_cnt = 0; - int lgkm_cnt = 0; - - vm_cnt = bits(instData.SIMM16, 3, 0); - exp_cnt = bits(instData.SIMM16, 6, 4); - lgkm_cnt = bits(instData.SIMM16, 11, 8); - - // if the counts are not maxed out, then we - // print out the count value - if (vm_cnt != 0xf) { - dis_stream << "vmcnt(" << vm_cnt << ")"; - } - - if (lgkm_cnt != 0xf) { - if (vm_cnt != 0xf) - dis_stream << " & "; - - 
dis_stream << "lgkmcnt(" << lgkm_cnt << ")"; - } - - if (exp_cnt != 0x7) { - if (vm_cnt != 0xf || lgkm_cnt != 0xf) - dis_stream << " & "; - - dis_stream << "expcnt(" << exp_cnt << ")"; - } - } - break; - default: - break; - } - - disassembly = dis_stream.str(); - } - - // --- Inst_SMEM base class methods --- - - Inst_SMEM::Inst_SMEM(InFmt_SMEM *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - setFlag(Scalar); - setFlag(GlobalSegment); - - // copy first instruction DWORD - instData = iFmt[0]; - // copy second instruction DWORD - extData = ((InFmt_SMEM_1 *)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - - if (instData.GLC) - setFlag(GloballyCoherent); - } // Inst_SMEM - - Inst_SMEM::~Inst_SMEM() - { - } // ~Inst_SMEM - - void - Inst_SMEM::initOperandInfo() - { - // Formats: - // 0 src + 0 dst - // 3 src + 0 dst - // 2 src + 1 dst - // 0 src + 1 dst - int opNum = 0; - // Needed because can't take addr of bitfield - int reg = 0; - - if (numSrcRegOperands()) { - reg = instData.SDATA; - if (numSrcRegOperands() == getNumOperands()) { - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), false, false); - opNum++; - } - - reg = instData.SBASE; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - true, false, false); - opNum++; - - reg = extData.OFFSET; - if (instData.IMM) { - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, false, true); - } else { - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), false, false); - } - opNum++; - } - - if (numDstRegOperands()) { - reg = instData.SDATA; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - isScalarReg(reg), false, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_SMEM::instSize() const - { - return 8; - } // instSize - - void - Inst_SMEM::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - if 
(numDstRegOperands()) { - if (getOperandSize(getNumOperands() - 1) > 4) { - dis_stream << "s[" << instData.SDATA << ":" - << instData.SDATA + getOperandSize(getNumOperands() - 1) / - 4 - 1 << "], "; - } else { - dis_stream << "s" << instData.SDATA << ", "; - } - } - - // SBASE has an implied LSB of 0, so we need - // to shift by one to get the actual value - dis_stream << "s[" << (instData.SBASE << 1) << ":" - << ((instData.SBASE << 1) + 1) << "], "; - - if (instData.IMM) { - // IMM == 1 implies OFFSET should be - // used as the offset - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(2) - << extData.OFFSET; - } else { - // IMM == 0 implies OFFSET should be - // used to specify SGRP in which the - // offset is held - dis_stream << "s" << extData.OFFSET; - } - - disassembly = dis_stream.str(); - } - - // --- Inst_VOP2 base class methods --- - - Inst_VOP2::Inst_VOP2(InFmt_VOP2 *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - // copy first instruction DWORD - instData = iFmt[0]; - if (hasSecondDword(iFmt)) { - // copy second instruction DWORD into union - extData = ((MachInst)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - varSize = 4 + 4; - if (iFmt->SRC0 == REG_SRC_DPP) { - setFlag(IsDPP); - } else if (iFmt->SRC0 == REG_SRC_SWDA) { - setFlag(IsSDWA); - } - } else { - varSize = 4; - } // if - } // Inst_VOP2 - - Inst_VOP2::~Inst_VOP2() - { - } // ~Inst_VOP2 - - void - Inst_VOP2::initOperandInfo() - { - int opNum = 0; - - // Needed because can't take addr of bitfield - int reg = instData.SRC0; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), isVectorReg(reg), false); - opNum++; - - reg = instData.VSRC1; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - - // VCC read - if (readsVCC()) { - srcOps.emplace_back(REG_VCC_LO, getOperandSize(opNum), true, - true, false, false); - opNum++; - } - - // VDST - reg = instData.VDST; - dstOps.emplace_back(reg, 
getOperandSize(opNum), false, - false, true, false); - opNum++; - - // VCC write - if (writesVCC()) { - dstOps.emplace_back(REG_VCC_LO, getOperandSize(opNum), false, - true, false, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_VOP2::instSize() const - { - return varSize; - } // instSize - - bool - Inst_VOP2::hasSecondDword(InFmt_VOP2 *iFmt) - { - /* - There are a few cases where VOP2 instructions have a second dword: - - 1. SRC0 is a literal - 2. SRC0 is being used to add a data parallel primitive (DPP) - operation to the instruction. - 3. SRC0 is being used for sub d-word addressing (SDWA) of the - operands in the instruction. - 4. VOP2 instructions also have four special opcodes:', - V_MADMK_{F16, F32} (0x24, 0x17), and V_MADAK_{F16, F32}', - (0x25, 0x18), that are always 64b. the only way to', - detect these special cases is to explicitly check,', - the opcodes', - */ - if (iFmt->SRC0 == REG_SRC_LITERAL || (iFmt->SRC0 == REG_SRC_DPP) || - (iFmt->SRC0 == REG_SRC_SWDA) || iFmt->OP == 0x17 || - iFmt->OP == 0x18 || iFmt->OP == 0x24 || iFmt->OP == 0x25) - return true; - - return false; - } - - void - Inst_VOP2::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - dis_stream << "v" << instData.VDST << ", "; - - if (writesVCC()) - dis_stream << "vcc, "; - - if ((instData.SRC0 == REG_SRC_LITERAL) || - (instData.SRC0 == REG_SRC_DPP) || - (instData.SRC0 == REG_SRC_SWDA)) { - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(8) - << _srcLiteral << ", "; - } else { - dis_stream << opSelectorToRegSym(instData.SRC0) << ", "; - } - - // VOP2 instructions have four special opcodes:', - // V_MADMK_{F16, F32} (0x24, 0x17), and V_MADAK_{F16, F32}', - // (0x25, 0x18), that are always 64b. 
the only way to', - // detect these special cases is to explicitly check,', - // the opcodes', - if (instData.OP == 0x17 || instData.OP == 0x18 || instData.OP == 0x24 - || instData.OP == 0x25) { - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(8) - << extData.imm_u32 << ", "; - } - - dis_stream << std::resetiosflags(std::ios_base::basefield) << "v" - << instData.VSRC1; - - if (readsVCC()) - dis_stream << ", vcc"; - - disassembly = dis_stream.str(); - } - - // --- Inst_VOP1 base class methods --- - - Inst_VOP1::Inst_VOP1(InFmt_VOP1 *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - // copy first instruction DWORD - instData = iFmt[0]; - if (hasSecondDword(iFmt)) { - // copy second instruction DWORD into union - extData = ((MachInst)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - varSize = 4 + 4; - if (iFmt->SRC0 == REG_SRC_DPP) { - setFlag(IsDPP); - } else if (iFmt->SRC0 == REG_SRC_SWDA) { - setFlag(IsSDWA); - } - } else { - varSize = 4; - } // if - } // Inst_VOP1 - - Inst_VOP1::~Inst_VOP1() - { - } // ~Inst_VOP1 - - void - Inst_VOP1::initOperandInfo() - { - int opNum = 0; - // Needed because can't take addr of bitfield - int reg = instData.SRC0; - - if (numSrcRegOperands()) { - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), isVectorReg(reg), false); - opNum++; - } - - if (numDstRegOperands()) { - reg = instData.VDST; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - false, true, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_VOP1::instSize() const - { - return varSize; - } // instSize - - bool - Inst_VOP1::hasSecondDword(InFmt_VOP1 *iFmt) - { - /* - There are several cases where VOP1 instructions have a second dword: - - 1. SRC0 is a literal. - 2. SRC0 is being used to add a data parallel primitive (DPP) - operation to the instruction. - 3. 
SRC0 is being used for sub d-word addressing (SDWA) of the - operands in the instruction. - */ - if ((iFmt->SRC0 == REG_SRC_LITERAL) || (iFmt->SRC0 == REG_SRC_DPP) || - (iFmt->SRC0 == REG_SRC_SWDA)) - return true; - - return false; - } - - void - Inst_VOP1::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - dis_stream << "v" << instData.VDST << ", "; - - if ((instData.SRC0 == REG_SRC_LITERAL) || - (instData.SRC0 == REG_SRC_DPP) || - (instData.SRC0 == REG_SRC_SWDA)) { - dis_stream << "0x" << std::hex << std::setfill('0') << std::setw(8) - << _srcLiteral; - } else { - dis_stream << opSelectorToRegSym(instData.SRC0); - } - - disassembly = dis_stream.str(); - } - - // --- Inst_VOPC base class methods --- - - Inst_VOPC::Inst_VOPC(InFmt_VOPC *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - setFlag(WritesVCC); - // copy first instruction DWORD - instData = iFmt[0]; - if (hasSecondDword(iFmt)) { - // copy second instruction DWORD into union - extData = ((MachInst)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - varSize = 4 + 4; - if (iFmt->SRC0 == REG_SRC_DPP) { - setFlag(IsDPP); - } else if (iFmt->SRC0 == REG_SRC_SWDA) { - setFlag(IsSDWA); - } - } else { - varSize = 4; - } // if - } // Inst_VOPC - - Inst_VOPC::~Inst_VOPC() - { - } // ~Inst_VOPC - - void - Inst_VOPC::initOperandInfo() - { - int opNum = 0; - - // Needed because can't take addr of bitfield - int reg = instData.SRC0; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), isVectorReg(reg), false); - opNum++; - - reg = instData.VSRC1; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - - assert(writesVCC()); - dstOps.emplace_back(REG_VCC_LO, getOperandSize(opNum), false, - true, false, false); - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_VOPC::instSize() const - { - return varSize; - } // instSize - - bool - 
Inst_VOPC::hasSecondDword(InFmt_VOPC *iFmt) - { - /* - There are several cases where VOPC instructions have a second dword: - - 1. SRC0 is a literal. - 2. SRC0 is being used to add a data parallel primitive (DPP) - operation to the instruction. - 3. SRC0 is being used for sub d-word addressing (SDWA) of the - operands in the instruction. - */ - if ((iFmt->SRC0 == REG_SRC_LITERAL) || (iFmt->SRC0 == REG_SRC_DPP) || - (iFmt->SRC0 == REG_SRC_SWDA)) - return true; - - return false; - } - - void - Inst_VOPC::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " vcc, "; - - dis_stream << opSelectorToRegSym(instData.SRC0) << ", "; - dis_stream << "v" << instData.VSRC1; - - disassembly = dis_stream.str(); - } - - // --- Inst_VINTRP base class methods --- - - Inst_VINTRP::Inst_VINTRP(InFmt_VINTRP *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - // copy first instruction DWORD - instData = iFmt[0]; - } // Inst_VINTRP - - Inst_VINTRP::~Inst_VINTRP() - { - } // ~Inst_VINTRP - - int - Inst_VINTRP::instSize() const - { - return 4; - } // instSize - - // --- Inst_VOP3 base class methods --- - - Inst_VOP3::Inst_VOP3(InFmt_VOP3 *iFmt, const std::string &opcode, - bool sgpr_dst) - : GCN3GPUStaticInst(opcode), sgprDst(sgpr_dst) - { - // copy first instruction DWORD - instData = iFmt[0]; - // copy second instruction DWORD - extData = ((InFmt_VOP3_1 *)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - } // Inst_VOP3 - - Inst_VOP3::~Inst_VOP3() - { - } // ~Inst_VOP3 - - void - Inst_VOP3::initOperandInfo() - { - // Also takes care of bitfield addr issue - unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; - - int opNum = 0; - - int numSrc = numSrcRegOperands() - readsVCC(); - int numDst = numDstRegOperands() - writesVCC(); - - for (opNum = 0; opNum < numSrc; opNum++) { - srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true, - isScalarReg(srcs[opNum]), - isVectorReg(srcs[opNum]), false); - } - - if 
(readsVCC()) { - srcOps.emplace_back(REG_VCC_LO, getOperandSize(opNum), true, - true, false, false); - opNum++; - } - - if (numDst) { - // Needed because can't take addr of bitfield - int reg = instData.VDST; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - sgprDst, !sgprDst, false); - opNum++; - } - - if (writesVCC()) { - dstOps.emplace_back(REG_VCC_LO, getOperandSize(opNum), false, - true, false, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_VOP3::instSize() const - { - return 8; - } // instSize - - void - Inst_VOP3::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - int num_regs = 0; - - if (getOperandSize(getNumOperands() - 1) > 4) { - num_regs = getOperandSize(getNumOperands() - 1) / 4; - if (sgprDst) - dis_stream << "s["; - else - dis_stream << "v["; - dis_stream << instData.VDST << ":" << instData.VDST + - num_regs - 1 << "], "; - } else { - if (sgprDst) - dis_stream << "s"; - else - dis_stream << "v"; - dis_stream << instData.VDST << ", "; - } - - num_regs = getOperandSize(0) / 4; - - if (extData.NEG & 0x1) { - dis_stream << "-" << opSelectorToRegSym(extData.SRC0, num_regs); - } else { - dis_stream << opSelectorToRegSym(extData.SRC0, num_regs); - } - - if (numSrcRegOperands() > 1) { - num_regs = getOperandSize(1) / 4; - - if (extData.NEG & 0x2) { - dis_stream << ", -" - << opSelectorToRegSym(extData.SRC1, num_regs); - } else { - dis_stream << ", " - << opSelectorToRegSym(extData.SRC1, num_regs); - } - } - - if (numSrcRegOperands() > 2) { - num_regs = getOperandSize(2) / 4; - - if (extData.NEG & 0x4) { - dis_stream << ", -" - << opSelectorToRegSym(extData.SRC2, num_regs); - } else { - dis_stream << ", " - << opSelectorToRegSym(extData.SRC2, num_regs); - } - } - - disassembly = dis_stream.str(); - } - - // --- Inst_VOP3_SDST_ENC base class methods --- - - Inst_VOP3_SDST_ENC::Inst_VOP3_SDST_ENC(InFmt_VOP3_SDST_ENC *iFmt, - const 
std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - // copy first instruction DWORD - instData = iFmt[0]; - // copy second instruction DWORD - extData = ((InFmt_VOP3_1 *)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - } // Inst_VOP3_SDST_ENC - - Inst_VOP3_SDST_ENC::~Inst_VOP3_SDST_ENC() - { - } // ~Inst_VOP3_SDST_ENC - - void - Inst_VOP3_SDST_ENC::initOperandInfo() - { - // Also takes care of bitfield addr issue - unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; - - int opNum = 0; - - int numSrc = numSrcRegOperands() - readsVCC(); - int numDst = numDstRegOperands() - writesVCC(); - - for (opNum = 0; opNum < numSrc; opNum++) { - srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true, - isScalarReg(srcs[opNum]), - isVectorReg(srcs[opNum]), false); - } - - if (readsVCC()) { - srcOps.emplace_back(REG_VCC_LO, getOperandSize(opNum), true, - true, false, false); - opNum++; - } - - if (numDst) { - // Needed because can't take addr of bitfield - int reg = instData.VDST; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - false, true, false); - opNum++; - } - - if (writesVCC()) { - dstOps.emplace_back(REG_VCC_LO, getOperandSize(opNum), false, - true, false, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_VOP3_SDST_ENC::instSize() const - { - return 8; - } // instSize - - void - Inst_VOP3_SDST_ENC::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - - dis_stream << "v" << instData.VDST << ", "; - - if (numDstRegOperands() == 2) { - if (getOperandSize(getNumOperands() - 1) > 4) { - int num_regs = getOperandSize(getNumOperands() - 1) / 4; - dis_stream << opSelectorToRegSym(instData.SDST, num_regs) - << ", "; - } else { - dis_stream << opSelectorToRegSym(instData.SDST) << ", "; - } - } - - if (extData.NEG & 0x1) { - dis_stream << "-" << opSelectorToRegSym(extData.SRC0) << ", "; - } else { - dis_stream << 
opSelectorToRegSym(extData.SRC0) << ", "; - } - - if (extData.NEG & 0x2) { - dis_stream << "-" << opSelectorToRegSym(extData.SRC1); - } else { - dis_stream << opSelectorToRegSym(extData.SRC1); - } - - if (numSrcRegOperands() == 3) { - if (extData.NEG & 0x4) { - dis_stream << ", -" << opSelectorToRegSym(extData.SRC2); - } else { - dis_stream << ", " << opSelectorToRegSym(extData.SRC2); - } - } - - if (readsVCC()) - dis_stream << ", vcc"; - - disassembly = dis_stream.str(); - } - - // --- Inst_DS base class methods --- - - Inst_DS::Inst_DS(InFmt_DS *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - setFlag(GroupSegment); - - // copy first instruction DWORD - instData = iFmt[0]; - // copy second instruction DWORD - extData = ((InFmt_DS_1 *)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - } // Inst_DS - - Inst_DS::~Inst_DS() - { - } // ~Inst_DS - - void - Inst_DS::initOperandInfo() - { - unsigned int srcs[3] = {extData.ADDR, extData.DATA0, extData.DATA1}; - - int opIdx = 0; - - for (opIdx = 0; opIdx < numSrcRegOperands(); opIdx++){ - srcOps.emplace_back(srcs[opIdx], getOperandSize(opIdx), true, - false, true, false); - } - - if (numDstRegOperands()) { - // Needed because can't take addr of bitfield - int reg = extData.VDST; - dstOps.emplace_back(reg, getOperandSize(opIdx), false, - false, true, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_DS::instSize() const - { - return 8; - } // instSize - - void - Inst_DS::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - - if (numDstRegOperands()) - dis_stream << "v" << extData.VDST << ", "; - - dis_stream << "v" << extData.ADDR; - - if (numSrcRegOperands() > 1) - dis_stream << ", v" << extData.DATA0; - - if (numSrcRegOperands() > 2) - dis_stream << ", v" << extData.DATA1; - - uint16_t offset = 0; - - if (instData.OFFSET1) { - offset += instData.OFFSET1; - offset <<= 8; - } - - if 
(instData.OFFSET0) - offset += instData.OFFSET0; - - if (offset) - dis_stream << " offset:" << offset; - - disassembly = dis_stream.str(); - } - - // --- Inst_MUBUF base class methods --- - - Inst_MUBUF::Inst_MUBUF(InFmt_MUBUF *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - // copy first instruction DWORD - instData = iFmt[0]; - // copy second instruction DWORD - extData = ((InFmt_MUBUF_1 *)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - - if (instData.GLC) - setFlag(GloballyCoherent); - - if (instData.SLC) - setFlag(SystemCoherent); - } // Inst_MUBUF - - Inst_MUBUF::~Inst_MUBUF() - { - } // ~Inst_MUBUF - - void - Inst_MUBUF::initOperandInfo() - { - // Currently there are three formats: - // 0 src + 0 dst - // 3 src + 1 dst - // 4 src + 0 dst - int opNum = 0; - - // Needed because can't take addr of bitfield; - int reg = 0; - - if (numSrcRegOperands()) { - if (numSrcRegOperands() == getNumOperands()) { - reg = extData.VDATA; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - } - - reg = extData.VADDR; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - - reg = extData.SRSRC; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), false, false); - opNum++; - - reg = extData.SOFFSET; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), false, false); - opNum++; - } - - // extData.VDATA moves in the reg list depending on the instruction - if (numDstRegOperands()) { - reg = extData.VDATA; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - false, true, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_MUBUF::instSize() const - { - return 8; - } // instSize - - void - Inst_MUBUF::generateDisassembly() - { - // SRSRC is always in units of 4 SGPRs - int srsrc_val = extData.SRSRC * 4; - std::stringstream dis_stream; - dis_stream << 
_opcode << " "; - dis_stream << "v" << extData.VDATA << ", v" << extData.VADDR << ", "; - dis_stream << "s[" << srsrc_val << ":" - << srsrc_val + 3 << "], "; - dis_stream << "s" << extData.SOFFSET; - - if (instData.OFFSET) - dis_stream << ", offset:" << instData.OFFSET; - - disassembly = dis_stream.str(); - } - - // --- Inst_MTBUF base class methods --- - - Inst_MTBUF::Inst_MTBUF(InFmt_MTBUF *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - // copy first instruction DWORD - instData = iFmt[0]; - // copy second instruction DWORD - extData = ((InFmt_MTBUF_1 *)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - - if (instData.GLC) - setFlag(GloballyCoherent); - - if (extData.SLC) - setFlag(SystemCoherent); - } // Inst_MTBUF - - Inst_MTBUF::~Inst_MTBUF() - { - } // ~Inst_MTBUF - - void - Inst_MTBUF::initOperandInfo() - { - // Currently there are two formats: - // 3 src + 1 dst - // 4 src + 0 dst - int opNum = 0; - - // Needed because can't take addr of bitfield - int reg = 0; - - if (numSrcRegOperands() == getNumOperands()) { - reg = extData.VDATA; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - } - - reg = extData.VADDR; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - - reg = extData.SRSRC; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), false, false); - opNum++; - - reg = extData.SOFFSET; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), false, false); - opNum++; - - // extData.VDATA moves in the reg list depending on the instruction - if (numDstRegOperands()) { - reg = extData.VDATA; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - false, true, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_MTBUF::instSize() const - { - return 8; - } // instSize - - // --- Inst_MIMG base class methods --- - - 
Inst_MIMG::Inst_MIMG(InFmt_MIMG *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - // copy first instruction DWORD - instData = iFmt[0]; - // copy second instruction DWORD - extData = ((InFmt_MIMG_1 *)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - - if (instData.GLC) - setFlag(GloballyCoherent); - - if (instData.SLC) - setFlag(SystemCoherent); - } // Inst_MIMG - - Inst_MIMG::~Inst_MIMG() - { - } // ~Inst_MIMG - - void - Inst_MIMG::initOperandInfo() - { - // Three formats: - // 1 dst + 2 src : s,s,d - // 0 dst + 3 src : s,s,s - // 1 dst + 3 src : s,s,s,d - int opNum = 0; - - // Needed because can't take addr of bitfield - int reg = 0; - - if (numSrcRegOperands() == getNumOperands()) { - reg = extData.VDATA; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - } - - reg = extData.VADDR; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - - reg = extData.SRSRC; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), false, false); - opNum++; - - if (getNumOperands() == 4) { - reg = extData.SSAMP; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - isScalarReg(reg), false, false); - opNum++; - } - - // extData.VDATA moves in the reg list depending on the instruction - if (numDstRegOperands()) { - reg = extData.VDATA; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - false, true, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_MIMG::instSize() const - { - return 8; - } // instSize - - // --- Inst_EXP base class methods --- - - Inst_EXP::Inst_EXP(InFmt_EXP *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - // copy first instruction DWORD - instData = iFmt[0]; - // copy second instruction DWORD - extData = ((InFmt_EXP_1 *)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - } // Inst_EXP - - Inst_EXP::~Inst_EXP() - { - } // 
~Inst_EXP - - void - Inst_EXP::initOperandInfo() - { - // Only 1 instruction, 1 format: 1 dst + 4 src - int opNum = 0; - - // Avoids taking addr of bitfield - unsigned int srcs[4] = {extData.VSRC0, extData.VSRC1, - extData.VSRC2, extData.VSRC3}; - - for (opNum = 0; opNum < 4; opNum++) { - srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true, - false, true, false); - } - - //TODO: Add the dst operand, don't know what it is right now - } - - int - Inst_EXP::instSize() const - { - return 8; - } // instSize - - // --- Inst_FLAT base class methods --- - - Inst_FLAT::Inst_FLAT(InFmt_FLAT *iFmt, const std::string &opcode) - : GCN3GPUStaticInst(opcode) - { - setFlag(Flat); - // copy first instruction DWORD - instData = iFmt[0]; - // copy second instruction DWORD - extData = ((InFmt_FLAT_1 *)iFmt)[1]; - _srcLiteral = *reinterpret_cast(&iFmt[1]); - - if (instData.GLC) - setFlag(GloballyCoherent); - - if (instData.SLC) - setFlag(SystemCoherent); - } // Inst_FLAT - - Inst_FLAT::~Inst_FLAT() - { - } // ~Inst_FLAT - - void - Inst_FLAT::initOperandInfo() - { - //3 formats: - // 1 dst + 1 src (load) - // 0 dst + 2 src (store) - // 1 dst + 2 src (atomic) - int opNum = 0; - - // Needed because can't take addr of bitfield - int reg = 0; - - if (getNumOperands() > 2) - assert(isAtomic()); - - reg = extData.ADDR; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - - if (numSrcRegOperands() == 2) { - reg = extData.DATA; - srcOps.emplace_back(reg, getOperandSize(opNum), true, - false, true, false); - opNum++; - } - - if (numDstRegOperands()) { - reg = extData.VDST; - dstOps.emplace_back(reg, getOperandSize(opNum), false, - false, true, false); - } - - assert(srcOps.size() == numSrcRegOperands()); - assert(dstOps.size() == numDstRegOperands()); - } - - int - Inst_FLAT::instSize() const - { - return 8; - } // instSize - - void - Inst_FLAT::generateDisassembly() - { - std::stringstream dis_stream; - dis_stream << _opcode << " "; - - if 
(isLoad()) - dis_stream << "v" << extData.VDST << ", "; - - dis_stream << "v[" << extData.ADDR << ":" << extData.ADDR + 1 << "]"; - - if (isStore()) - dis_stream << ", v" << extData.DATA; - - disassembly = dis_stream.str(); - } -} // namespace Gcn3ISA -} // namespace gem5 diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh b/src/arch/amdgpu/gcn3/insts/op_encodings.hh deleted file mode 100644 index 880ccc4503..0000000000 --- a/src/arch/amdgpu/gcn3/insts/op_encodings.hh +++ /dev/null @@ -1,925 +0,0 @@ -/* - * Copyright (c) 2016-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__ -#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__ - -#include "arch/amdgpu/gcn3/gpu_decoder.hh" -#include "arch/amdgpu/gcn3/gpu_mem_helpers.hh" -#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh" -#include "arch/amdgpu/gcn3/operand.hh" -#include "debug/GCN3.hh" -#include "debug/GPUExec.hh" -#include "mem/ruby/system/RubySystem.hh" - -namespace gem5 -{ - -namespace Gcn3ISA -{ - struct BufferRsrcDescriptor - { - uint64_t baseAddr : 48; - uint32_t stride : 14; - uint32_t cacheSwizzle : 1; - uint32_t swizzleEn : 1; - uint32_t numRecords : 32; - uint32_t dstSelX : 3; - uint32_t dstSelY : 3; - uint32_t dstSelZ : 3; - uint32_t dstSelW : 3; - uint32_t numFmt : 3; - uint32_t dataFmt : 4; - uint32_t elemSize : 2; - uint32_t idxStride : 2; - uint32_t addTidEn : 1; - uint32_t atc : 1; - uint32_t hashEn : 1; - uint32_t heap : 1; - uint32_t mType : 3; - uint32_t type : 2; - }; - - // --- purely virtual instruction classes --- - - class Inst_SOP2 : public GCN3GPUStaticInst - { - public: - Inst_SOP2(InFmt_SOP2*, const std::string &opcode); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_SOP2 instData; - // possible second DWORD - InstFormat extData; - uint32_t varSize; - - private: - bool hasSecondDword(InFmt_SOP2 *); - }; // Inst_SOP2 - - class Inst_SOPK : public GCN3GPUStaticInst - { - 
public: - Inst_SOPK(InFmt_SOPK*, const std::string &opcode); - ~Inst_SOPK(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_SOPK instData; - // possible second DWORD - InstFormat extData; - uint32_t varSize; - - private: - bool hasSecondDword(InFmt_SOPK *); - }; // Inst_SOPK - - class Inst_SOP1 : public GCN3GPUStaticInst - { - public: - Inst_SOP1(InFmt_SOP1*, const std::string &opcode); - ~Inst_SOP1(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_SOP1 instData; - // possible second DWORD - InstFormat extData; - uint32_t varSize; - - private: - bool hasSecondDword(InFmt_SOP1 *); - }; // Inst_SOP1 - - class Inst_SOPC : public GCN3GPUStaticInst - { - public: - Inst_SOPC(InFmt_SOPC*, const std::string &opcode); - ~Inst_SOPC(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_SOPC instData; - // possible second DWORD - InstFormat extData; - uint32_t varSize; - - private: - bool hasSecondDword(InFmt_SOPC *); - }; // Inst_SOPC - - class Inst_SOPP : public GCN3GPUStaticInst - { - public: - Inst_SOPP(InFmt_SOPP*, const std::string &opcode); - ~Inst_SOPP(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_SOPP instData; - }; // Inst_SOPP - - class Inst_SMEM : public GCN3GPUStaticInst - { - public: - Inst_SMEM(InFmt_SMEM*, const std::string &opcode); - ~Inst_SMEM(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - /** - * initiate a memory read access for N dwords - */ - template - void - initMemRead(GPUDynInstPtr gpuDynInst) - { - 
initMemReqScalarHelper(gpuDynInst, - MemCmd::ReadReq); - } - - /** - * initiate a memory write access for N dwords - */ - template - void - initMemWrite(GPUDynInstPtr gpuDynInst) - { - initMemReqScalarHelper(gpuDynInst, - MemCmd::WriteReq); - } - - /** - * For normal s_load_dword/s_store_dword instruction addresses. - */ - void - calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr, - ScalarRegU32 offset) - { - Addr vaddr = ((addr.rawData() + offset) & ~0x3); - gpu_dyn_inst->scalarAddr = vaddr; - } - - /** - * For s_buffer_load_dword/s_buffer_store_dword instruction addresses. - * The s_buffer instructions use the same buffer resource descriptor - * as the MUBUF instructions. - */ - void - calcAddr(GPUDynInstPtr gpu_dyn_inst, - ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset) - { - BufferRsrcDescriptor rsrc_desc; - ScalarRegU32 clamped_offset(offset); - std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(), - sizeof(BufferRsrcDescriptor)); - - /** - * The address is clamped if: - * Stride is zero: clamp if offset >= num_records - * Stride is non-zero: clamp if offset > (stride * num_records) - */ - if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) { - clamped_offset = rsrc_desc.numRecords; - } else if (rsrc_desc.stride && offset - > (rsrc_desc.stride * rsrc_desc.numRecords)) { - clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords); - } - - Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3); - gpu_dyn_inst->scalarAddr = vaddr; - } - - // first instruction DWORD - InFmt_SMEM instData; - // second instruction DWORD - InFmt_SMEM_1 extData; - }; // Inst_SMEM - - class Inst_VOP2 : public GCN3GPUStaticInst - { - public: - Inst_VOP2(InFmt_VOP2*, const std::string &opcode); - ~Inst_VOP2(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_VOP2 instData; - // possible second DWORD - InstFormat extData; - uint32_t 
varSize; - - private: - bool hasSecondDword(InFmt_VOP2 *); - }; // Inst_VOP2 - - class Inst_VOP1 : public GCN3GPUStaticInst - { - public: - Inst_VOP1(InFmt_VOP1*, const std::string &opcode); - ~Inst_VOP1(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_VOP1 instData; - // possible second DWORD - InstFormat extData; - uint32_t varSize; - - private: - bool hasSecondDword(InFmt_VOP1 *); - }; // Inst_VOP1 - - class Inst_VOPC : public GCN3GPUStaticInst - { - public: - Inst_VOPC(InFmt_VOPC*, const std::string &opcode); - ~Inst_VOPC(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_VOPC instData; - // possible second DWORD - InstFormat extData; - uint32_t varSize; - - private: - bool hasSecondDword(InFmt_VOPC *); - }; // Inst_VOPC - - class Inst_VINTRP : public GCN3GPUStaticInst - { - public: - Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode); - ~Inst_VINTRP(); - - int instSize() const override; - - protected: - // first instruction DWORD - InFmt_VINTRP instData; - }; // Inst_VINTRP - - class Inst_VOP3 : public GCN3GPUStaticInst - { - public: - Inst_VOP3(InFmt_VOP3*, const std::string &opcode, bool sgpr_dst); - ~Inst_VOP3(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_VOP3 instData; - // second instruction DWORD - InFmt_VOP3_1 extData; - - private: - bool hasSecondDword(InFmt_VOP3 *); - /** - * the v_cmp and readlane instructions in the VOP3 - * encoding are unique because they are the only - * instructions that use the VDST field to specify - * a scalar register destination. for VOP3::V_CMP insts - * VDST specifies the arbitrary SGPR pair used to write - * VCC. 
for V_READLANE VDST specifies the SGPR to return - * the value of the selected lane in the source VGPR - * from which we are reading. - */ - const bool sgprDst; - }; // Inst_VOP3 - - class Inst_VOP3_SDST_ENC : public GCN3GPUStaticInst - { - public: - Inst_VOP3_SDST_ENC(InFmt_VOP3_SDST_ENC*, const std::string &opcode); - ~Inst_VOP3_SDST_ENC(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_VOP3_SDST_ENC instData; - // second instruction DWORD - InFmt_VOP3_1 extData; - - private: - bool hasSecondDword(InFmt_VOP3_SDST_ENC *); - }; // Inst_VOP3_SDST_ENC - - class Inst_DS : public GCN3GPUStaticInst - { - public: - Inst_DS(InFmt_DS*, const std::string &opcode); - ~Inst_DS(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - template - void - initMemRead(GPUDynInstPtr gpuDynInst, Addr offset) - { - Wavefront *wf = gpuDynInst->wavefront(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane] + offset; - - (reinterpret_cast(gpuDynInst->d_data))[lane] - = wf->ldsChunk->read(vaddr); - } - } - } - - template - void - initMemRead(GPUDynInstPtr gpuDynInst, Addr offset) - { - Wavefront *wf = gpuDynInst->wavefront(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane] + offset; - for (int i = 0; i < N; ++i) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * N + i] - = wf->ldsChunk->read( - vaddr + i*sizeof(VecElemU32)); - } - } - } - } - - template - void - initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1) - { - Wavefront *wf = gpuDynInst->wavefront(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr0 = gpuDynInst->addr[lane] + offset0; - Addr 
vaddr1 = gpuDynInst->addr[lane] + offset1; - - (reinterpret_cast(gpuDynInst->d_data))[lane * 2] - = wf->ldsChunk->read(vaddr0); - (reinterpret_cast(gpuDynInst->d_data))[lane * 2 + 1] - = wf->ldsChunk->read(vaddr1); - } - } - } - - template - void - initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset) - { - Wavefront *wf = gpuDynInst->wavefront(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane] + offset; - wf->ldsChunk->write(vaddr, - (reinterpret_cast(gpuDynInst->d_data))[lane]); - } - } - } - - template - void - initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset) - { - Wavefront *wf = gpuDynInst->wavefront(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane] + offset; - for (int i = 0; i < N; ++i) { - wf->ldsChunk->write( - vaddr + i*sizeof(VecElemU32), - (reinterpret_cast( - gpuDynInst->d_data))[lane * N + i]); - } - } - } - } - - template - void - initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1) - { - Wavefront *wf = gpuDynInst->wavefront(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr0 = gpuDynInst->addr[lane] + offset0; - Addr vaddr1 = gpuDynInst->addr[lane] + offset1; - wf->ldsChunk->write(vaddr0, (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]); - wf->ldsChunk->write(vaddr1, (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]); - } - } - } - - void - calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr) - { - Wavefront *wf = gpuDynInst->wavefront(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - gpuDynInst->addr.at(lane) = (Addr)addr[lane]; - } - } - } - - // first instruction DWORD - InFmt_DS instData; - // second instruction DWORD - InFmt_DS_1 extData; - }; // Inst_DS - - class Inst_MUBUF : public GCN3GPUStaticInst - { - public: - 
Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode); - ~Inst_MUBUF(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - template - void - initMemRead(GPUDynInstPtr gpuDynInst) - { - // temporarily modify exec_mask to supress memory accesses to oob - // regions. Only issue memory requests for lanes that have their - // exec_mask set and are not out of bounds. - VectorMask old_exec_mask = gpuDynInst->exec_mask; - gpuDynInst->exec_mask &= ~oobMask; - initMemReqHelper(gpuDynInst, MemCmd::ReadReq); - gpuDynInst->exec_mask = old_exec_mask; - } - - - template - void - initMemRead(GPUDynInstPtr gpuDynInst) - { - // temporarily modify exec_mask to supress memory accesses to oob - // regions. Only issue memory requests for lanes that have their - // exec_mask set and are not out of bounds. - VectorMask old_exec_mask = gpuDynInst->exec_mask; - gpuDynInst->exec_mask &= ~oobMask; - initMemReqHelper(gpuDynInst, MemCmd::ReadReq); - gpuDynInst->exec_mask = old_exec_mask; - } - - template - void - initMemWrite(GPUDynInstPtr gpuDynInst) - { - // temporarily modify exec_mask to supress memory accesses to oob - // regions. Only issue memory requests for lanes that have their - // exec_mask set and are not out of bounds. - VectorMask old_exec_mask = gpuDynInst->exec_mask; - gpuDynInst->exec_mask &= ~oobMask; - initMemReqHelper(gpuDynInst, MemCmd::WriteReq); - gpuDynInst->exec_mask = old_exec_mask; - } - - template - void - initMemWrite(GPUDynInstPtr gpuDynInst) - { - // temporarily modify exec_mask to supress memory accesses to oob - // regions. Only issue memory requests for lanes that have their - // exec_mask set and are not out of bounds. 
- VectorMask old_exec_mask = gpuDynInst->exec_mask; - gpuDynInst->exec_mask &= ~oobMask; - initMemReqHelper(gpuDynInst, MemCmd::WriteReq); - gpuDynInst->exec_mask = old_exec_mask; - } - - void - injectGlobalMemFence(GPUDynInstPtr gpuDynInst) - { - // create request and set flags - gpuDynInst->resetEntireStatusVector(); - gpuDynInst->setStatusVector(0, 1); - RequestPtr req = std::make_shared(0, 0, 0, - gpuDynInst->computeUnit()-> - requestorId(), 0, - gpuDynInst->wfDynId); - gpuDynInst->setRequestFlags(req); - gpuDynInst->computeUnit()-> - injectGlobalMemFence(gpuDynInst, false, req); - } - - /** - * MUBUF insructions calculate their addresses as follows: - * - * index = (IDXEN ? vgpr_idx : 0) + (const_add_tid_en ? TID : 0) - * offset = (OFFEN ? vgpr_off : 0) + inst_off - * - * / ====================== LINEAR ADDRESSING ====================== / - * VADDR = base + sgpr_off + offset + stride * index - * - * / ===================== SWIZZLED ADDRESSING ===================== / - * index_msb = index / const_index_stride - * index_lsb = index % const_index_stride - * offset_msb = offset / const_element_size - * offset_lsb = offset % const_element_size - * buffer_offset = ((index_msb * stride + offset_msb * - * const_element_size) * const_index_stride + - * index_lsb * const_element_size + offset_lsb) - * - * VADDR = base + sgpr_off + buffer_offset - */ - template - void - calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx, - SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset) - { - Addr vaddr = 0; - Addr base_addr = 0; - Addr stride = 0; - Addr buf_idx = 0; - Addr buf_off = 0; - Addr buffer_offset = 0; - BufferRsrcDescriptor rsrc_desc; - - std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(), - sizeof(BufferRsrcDescriptor)); - - base_addr = rsrc_desc.baseAddr; - - stride = rsrc_desc.addTidEn ? 
((rsrc_desc.dataFmt << 14) - + rsrc_desc.stride) : rsrc_desc.stride; - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vaddr = base_addr + s_offset.rawData(); - /** - * first we calculate the buffer's index and offset. - * these will be used for either linear or swizzled - * buffers. - */ - buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0); - - buf_off = v_off[lane] + inst_offset; - - if (rsrc_desc.swizzleEn) { - Addr idx_stride = 8 << rsrc_desc.idxStride; - Addr elem_size = 2 << rsrc_desc.elemSize; - Addr idx_msb = buf_idx / idx_stride; - Addr idx_lsb = buf_idx % idx_stride; - Addr off_msb = buf_off / elem_size; - Addr off_lsb = buf_off % elem_size; - DPRINTF(GCN3, "mubuf swizzled lane %d: " - "idx_stride = %llx, elem_size = %llx, " - "idx_msb = %llx, idx_lsb = %llx, " - "off_msb = %llx, off_lsb = %llx\n", - lane, idx_stride, elem_size, idx_msb, idx_lsb, - off_msb, off_lsb); - - buffer_offset =(idx_msb * stride + off_msb * elem_size) - * idx_stride + idx_lsb * elem_size + off_lsb; - } else { - buffer_offset = buf_off + stride * buf_idx; - } - - - /** - * Range check behavior causes out of range accesses to - * to be treated differently. Out of range accesses return - * 0 for loads and are ignored for stores. For - * non-formatted accesses, this is done on a per-lane - * basis. 
- */ - if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) { - if (buffer_offset >= - rsrc_desc.numRecords - s_offset.rawData()) { - DPRINTF(GCN3, "mubuf out-of-bounds condition 1: " - "lane = %d, buffer_offset = %llx, " - "const_stride = %llx, " - "const_num_records = %llx\n", - lane, buf_off + stride * buf_idx, - rsrc_desc.stride, rsrc_desc.numRecords); - oobMask.set(lane); - continue; - } - } - - if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) { - if (buf_idx >= rsrc_desc.numRecords || - buf_off >= stride) { - DPRINTF(GCN3, "mubuf out-of-bounds condition 2: " - "lane = %d, offset = %llx, " - "index = %llx, " - "const_num_records = %llx\n", - lane, buf_off, buf_idx, - rsrc_desc.numRecords); - oobMask.set(lane); - continue; - } - } - - vaddr += buffer_offset; - - DPRINTF(GCN3, "Calculating mubuf address for lane %d: " - "vaddr = %llx, base_addr = %llx, " - "stride = %llx, buf_idx = %llx, buf_off = %llx\n", - lane, vaddr, base_addr, stride, - buf_idx, buf_off); - gpuDynInst->addr.at(lane) = vaddr; - } - } - } - - // first instruction DWORD - InFmt_MUBUF instData; - // second instruction DWORD - InFmt_MUBUF_1 extData; - // Mask of lanes with out-of-bounds accesses. Needs to be tracked - // seperately from the exec_mask so that we remember to write zero - // to the registers associated with out of bounds lanes. 
- VectorMask oobMask; - }; // Inst_MUBUF - - class Inst_MTBUF : public GCN3GPUStaticInst - { - public: - Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode); - ~Inst_MTBUF(); - - int instSize() const override; - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_MTBUF instData; - // second instruction DWORD - InFmt_MTBUF_1 extData; - - private: - bool hasSecondDword(InFmt_MTBUF *); - }; // Inst_MTBUF - - class Inst_MIMG : public GCN3GPUStaticInst - { - public: - Inst_MIMG(InFmt_MIMG*, const std::string &opcode); - ~Inst_MIMG(); - - int instSize() const override; - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_MIMG instData; - // second instruction DWORD - InFmt_MIMG_1 extData; - }; // Inst_MIMG - - class Inst_EXP : public GCN3GPUStaticInst - { - public: - Inst_EXP(InFmt_EXP*, const std::string &opcode); - ~Inst_EXP(); - - int instSize() const override; - void initOperandInfo() override; - - protected: - // first instruction DWORD - InFmt_EXP instData; - // second instruction DWORD - InFmt_EXP_1 extData; - }; // Inst_EXP - - class Inst_FLAT : public GCN3GPUStaticInst - { - public: - Inst_FLAT(InFmt_FLAT*, const std::string &opcode); - ~Inst_FLAT(); - - int instSize() const override; - void generateDisassembly() override; - - void initOperandInfo() override; - - protected: - template - void - initMemRead(GPUDynInstPtr gpuDynInst) - { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - initMemReqHelper(gpuDynInst, MemCmd::ReadReq); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - Wavefront *wf = gpuDynInst->wavefront(); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane] - = wf->ldsChunk->read(vaddr); - } - } - } - } - - template - void - initMemRead(GPUDynInstPtr gpuDynInst) - { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - 
initMemReqHelper(gpuDynInst, MemCmd::ReadReq); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - Wavefront *wf = gpuDynInst->wavefront(); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - for (int i = 0; i < N; ++i) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * N + i] - = wf->ldsChunk->read( - vaddr + i*sizeof(VecElemU32)); - } - } - } - } - } - - template - void - initMemWrite(GPUDynInstPtr gpuDynInst) - { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - initMemReqHelper(gpuDynInst, MemCmd::WriteReq); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - Wavefront *wf = gpuDynInst->wavefront(); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - wf->ldsChunk->write(vaddr, - (reinterpret_cast(gpuDynInst->d_data))[lane]); - } - } - } - } - - template - void - initMemWrite(GPUDynInstPtr gpuDynInst) - { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - initMemReqHelper(gpuDynInst, MemCmd::WriteReq); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - Wavefront *wf = gpuDynInst->wavefront(); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - for (int i = 0; i < N; ++i) { - wf->ldsChunk->write( - vaddr + i*sizeof(VecElemU32), - (reinterpret_cast( - gpuDynInst->d_data))[lane * N + i]); - } - } - } - } - } - - template - void - initAtomicAccess(GPUDynInstPtr gpuDynInst) - { - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - initMemReqHelper(gpuDynInst, MemCmd::SwapReq, true); - } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { - Wavefront *wf = gpuDynInst->wavefront(); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - Addr vaddr = gpuDynInst->addr[lane]; - auto amo_op = - gpuDynInst->makeAtomicOpFunctor( - 
&(reinterpret_cast( - gpuDynInst->a_data))[lane], - &(reinterpret_cast( - gpuDynInst->x_data))[lane]); - - T tmp = wf->ldsChunk->read(vaddr); - (*amo_op)(reinterpret_cast(&tmp)); - wf->ldsChunk->write(vaddr, tmp); - (reinterpret_cast(gpuDynInst->d_data))[lane] = tmp; - } - } - } - } - - void - calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr) - { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - gpuDynInst->addr.at(lane) = addr[lane]; - } - } - gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask); - } - - // first instruction DWORD - InFmt_FLAT instData; - // second instruction DWORD - InFmt_FLAT_1 extData; - }; // Inst_FLAT -} // namespace Gcn3ISA -} // namespace gem5 - -#endif // __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__ diff --git a/src/arch/amdgpu/gcn3/isa.cc b/src/arch/amdgpu/gcn3/isa.cc deleted file mode 100644 index 385a0f0901..0000000000 --- a/src/arch/amdgpu/gcn3/isa.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2016-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "arch/amdgpu/gcn3/gpu_isa.hh" - -#include - -#include "gpu-compute/gpu_static_inst.hh" -#include "gpu-compute/wavefront.hh" - -namespace gem5 -{ - -namespace Gcn3ISA -{ - GPUISA::GPUISA(Wavefront &wf) : wavefront(wf), m0(0) - { - } - - ScalarRegU32 - GPUISA::readMiscReg(int opIdx) const - { - switch (opIdx) { - case REG_M0: - return m0; - case REG_ZERO: - return 0; - case REG_SCC: - return statusReg.SCC; - default: - fatal("attempting to read from unsupported or non-readable " - "register. selector val: %i\n", opIdx); - return 0; - } - } - - void - GPUISA::writeMiscReg(int opIdx, ScalarRegU32 operandVal) - { - switch (opIdx) { - case REG_M0: - m0 = operandVal; - break; - case REG_SCC: - statusReg.SCC = operandVal ? 1 : 0; - break; - default: - fatal("attempting to write to an unsupported or non-writable " - "register. 
selector val: %i\n", opIdx); - break; - } - } - - void - GPUISA::advancePC(GPUDynInstPtr gpuDynInst) - { - wavefront.pc(wavefront.pc() - + gpuDynInst->staticInstruction()->instSize()); - } - - const std::array - GPUISA::posConstRegs = { { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, - 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, - 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 - } }; - - const std::array - GPUISA::negConstRegs = { { - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, - -16 - } }; -} // namespace Gcn3ISA -} // namespace gem5 diff --git a/src/arch/amdgpu/gcn3/operand.hh b/src/arch/amdgpu/gcn3/operand.hh deleted file mode 100644 index 769f28a8a8..0000000000 --- a/src/arch/amdgpu/gcn3/operand.hh +++ /dev/null @@ -1,752 +0,0 @@ -/* - * Copyright (c) 2017-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ARCH_GCN3_OPERAND_HH__ -#define __ARCH_GCN3_OPERAND_HH__ - -#include - -#include "arch/amdgpu/gcn3/gpu_registers.hh" -#include "arch/generic/vec_reg.hh" -#include "gpu-compute/scalar_register_file.hh" -#include "gpu-compute/vector_register_file.hh" -#include "gpu-compute/wavefront.hh" - -namespace gem5 -{ - -/** - * classes that represnt vector/scalar operands in GCN3 ISA. these classes - * wrap the generic vector register type (i.e., src/arch/generic/vec_reg.hh) - * and allow them to be manipulated in ways that are unique to GCN3 insts. - */ - -namespace Gcn3ISA -{ - /** - * convenience traits so we can automatically infer the correct FP type - * without looking at the number of dwords (i.e., to determine if we - * need a float or a double when creating FP constants). - */ - template struct OpTraits { typedef float FloatT; }; - template<> struct OpTraits { typedef double FloatT; }; - template<> struct OpTraits { typedef double FloatT; }; - - class Operand - { - public: - Operand() = delete; - - Operand(GPUDynInstPtr gpuDynInst, int opIdx) - : _gpuDynInst(gpuDynInst), _opIdx(opIdx) - { - assert(_gpuDynInst); - assert(_opIdx >= 0); - } - - /** - * read from and write to the underlying register(s) that - * this operand is referring to. 
- */ - virtual void read() = 0; - virtual void write() = 0; - - protected: - /** - * instruction object that owns this operand - */ - GPUDynInstPtr _gpuDynInst; - /** - * op selector value for this operand. note that this is not - * the same as the register file index, be it scalar or vector. - * this could refer to inline constants, system regs, or even - * special values. - */ - int _opIdx; - }; - - template - class ScalarOperand; - - template - class VecOperand final : public Operand - { - static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords, - "Incorrect number of DWORDS for GCN3 operand."); - - public: - VecOperand() = delete; - - VecOperand(GPUDynInstPtr gpuDynInst, int opIdx) - : Operand(gpuDynInst, opIdx), scalar(false), absMod(false), - negMod(false), scRegData(gpuDynInst, _opIdx), - vrfData{{ nullptr }} - { - vecReg.zero(); - } - - ~VecOperand() - { - } - - /** - * certain vector operands can read from the vrf/srf or constants. - * we use this method to first determine the type of the operand, - * then we read from the appropriate source. if vector we read - * directly from the vrf. if scalar, we read in the data through - * the scalar operand component. this should only be used for VSRC - * operands. - */ - void - readSrc() - { - if (isVectorReg(_opIdx)) { - _opIdx = opSelectorToRegIdx(_opIdx, _gpuDynInst->wavefront() - ->reservedScalarRegs); - read(); - } else { - readScalar(); - } - } - - /** - * read from the vrf. this should only be used by vector inst - * source operands that are explicitly vector (i.e., VSRC). 
- */ - void - read() override - { - assert(_gpuDynInst); - assert(_gpuDynInst->wavefront()); - assert(_gpuDynInst->computeUnit()); - Wavefront *wf = _gpuDynInst->wavefront(); - ComputeUnit *cu = _gpuDynInst->computeUnit(); - - for (auto i = 0; i < NumDwords; ++i) { - int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i); - vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx); - - DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx); - cu->vrf[wf->simdId]->printReg(wf, vgprIdx); - } - - if (NumDwords == 1) { - assert(vrfData[0]); - auto vgpr = vecReg.template as(); - auto reg_file_vgpr = vrfData[0]->template as(); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - std::memcpy((void*)&vgpr[lane], - (void*)®_file_vgpr[lane], sizeof(DataType)); - } - } else if (NumDwords == 2) { - assert(vrfData[0]); - assert(vrfData[1]); - auto vgpr = vecReg.template as(); - auto reg_file_vgpr0 = vrfData[0]->template as(); - auto reg_file_vgpr1 = vrfData[1]->template as(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - VecElemU64 tmp_val(0); - ((VecElemU32*)&tmp_val)[0] = reg_file_vgpr0[lane]; - ((VecElemU32*)&tmp_val)[1] = reg_file_vgpr1[lane]; - vgpr[lane] = tmp_val; - } - } - } - - /** - * write to the vrf. we maintain a copy of the underlying vector - * reg(s) for this operand (i.e., vrfData/scRegData), as well as a - * temporary vector register representation (i.e., vecReg) of the - * vector register, which allows the execute() methods of instructions - * to easily write their operand data using operator[] regardless of - * their size. after the result is calculated we use write() to write - * the data to the actual register file storage. this allows us to do - * type conversion, etc., in a single call as opposed to doing it - * in each execute() method. 
- */ - void - write() override - { - assert(_gpuDynInst); - assert(_gpuDynInst->wavefront()); - assert(_gpuDynInst->computeUnit()); - Wavefront *wf = _gpuDynInst->wavefront(); - ComputeUnit *cu = _gpuDynInst->computeUnit(); - VectorMask &exec_mask = _gpuDynInst->isLoad() - ? _gpuDynInst->exec_mask : wf->execMask(); - - if (NumDwords == 1) { - int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx); - vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx); - assert(vrfData[0]); - auto reg_file_vgpr = vrfData[0]->template as(); - auto vgpr = vecReg.template as(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (exec_mask[lane] || _gpuDynInst->ignoreExec()) { - std::memcpy((void*)®_file_vgpr[lane], - (void*)&vgpr[lane], sizeof(DataType)); - } - } - - DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx); - cu->vrf[wf->simdId]->printReg(wf, vgprIdx); - } else if (NumDwords == 2) { - int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx); - int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1); - vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0); - vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1); - assert(vrfData[0]); - assert(vrfData[1]); - auto reg_file_vgpr0 = vrfData[0]->template as(); - auto reg_file_vgpr1 = vrfData[1]->template as(); - auto vgpr = vecReg.template as(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (exec_mask[lane] || _gpuDynInst->ignoreExec()) { - reg_file_vgpr0[lane] = ((VecElemU32*)&vgpr[lane])[0]; - reg_file_vgpr1[lane] = ((VecElemU32*)&vgpr[lane])[1]; - } - } - - DPRINTF(GPUVRF, "Write v[%d:%d]\n", vgprIdx0, vgprIdx1); - cu->vrf[wf->simdId]->printReg(wf, vgprIdx0); - cu->vrf[wf->simdId]->printReg(wf, vgprIdx1); - } - } - - void - negModifier() - { - negMod = true; - } - - void - absModifier() - { - absMod = true; - } - - /** - * getter [] operator. 
only enable if this operand is constant - * (i.e, a source operand) and if it can be represented using - * primitive types (i.e., 8b to 64b primitives). - */ - template - typename std::enable_if_t - operator[](size_t idx) const - { - assert(idx < NumVecElemPerVecReg); - - if (scalar) { - DataType ret_val = scRegData.rawData(); - - if (absMod) { - assert(std::is_floating_point_v); - ret_val = std::fabs(ret_val); - } - - if (negMod) { - assert(std::is_floating_point_v); - ret_val = -ret_val; - } - - return ret_val; - } else { - auto vgpr = vecReg.template as(); - DataType ret_val = vgpr[idx]; - - if (absMod) { - assert(std::is_floating_point_v); - ret_val = std::fabs(ret_val); - } - - if (negMod) { - assert(std::is_floating_point_v); - ret_val = -ret_val; - } - - return ret_val; - } - } - - /** - * setter [] operator. only enable if this operand is non-constant - * (i.e, a destination operand) and if it can be represented using - * primitive types (i.e., 8b to 64b primitives). - */ - template - typename std::enable_if_t - operator[](size_t idx) - { - assert(!scalar); - assert(idx < NumVecElemPerVecReg); - - return vecReg.template as()[idx]; - } - - private: - /** - * if we determine that this operand is a scalar (reg or constant) - * then we read the scalar data into the scalar operand data member. - */ - void - readScalar() - { - scalar = true; - scRegData.read(); - } - - using VecRegCont = - VecRegContainer; - - /** - * whether this operand a scalar or not. - */ - bool scalar; - /** - * absolute value and negative modifiers. VOP3 instructions - * may indicate that their input/output operands must be - * modified, either by taking the absolute value or negating - * them. these bools indicate which modifier, if any, to use. - */ - bool absMod; - bool negMod; - /** - * this holds all the operand data in a single vector register - * object (i.e., if an operand is 64b, this will hold the data - * from both registers the operand is using). 
- */ - VecRegCont vecReg; - /** - * for src operands that read scalars (i.e., scalar regs or - * a scalar constant). - */ - ScalarOperand scRegData; - /** - * pointers to the underlyding registers (i.e., the actual - * registers in the register file). - */ - std::array vrfData; - }; - - template - class ScalarOperand final : public Operand - { - static_assert(NumDwords >= 1 && NumDwords <= MaxOperandDwords, - "Incorrect number of DWORDS for GCN3 operand."); - public: - ScalarOperand() = delete; - - ScalarOperand(GPUDynInstPtr gpuDynInst, int opIdx) - : Operand(gpuDynInst, opIdx) - { - std::memset(srfData.data(), 0, NumDwords * sizeof(ScalarRegU32)); - } - - ~ScalarOperand() - { - } - - /** - * we store scalar data in a std::array, however if we need the - * full operand data we use this method to copy all elements of - * the scalar operand data to a single primitive container. only - * useful for 8b to 64b primitive types, as they are the only types - * that we need to perform computation on. 
- */ - template - typename std::enable_if_t - rawData() const - { - assert(sizeof(DataType) <= sizeof(srfData)); - DataType raw_data((DataType)0); - std::memcpy((void*)&raw_data, (void*)srfData.data(), - sizeof(DataType)); - - return raw_data; - } - - void* - rawDataPtr() - { - return (void*)srfData.data(); - } - - void - read() override - { - Wavefront *wf = _gpuDynInst->wavefront(); - ComputeUnit *cu = _gpuDynInst->computeUnit(); - - if (!isScalarReg(_opIdx)) { - readSpecialVal(); - } else { - for (auto i = 0; i < NumDwords; ++i) { - int sgprIdx = regIdx(i); - srfData[i] = cu->srf[wf->simdId]->read(sgprIdx); - DPRINTF(GPUSRF, "Read s[%d]\n", sgprIdx); - cu->srf[wf->simdId]->printReg(wf, sgprIdx); - } - } - } - - void - write() override - { - Wavefront *wf = _gpuDynInst->wavefront(); - ComputeUnit *cu = _gpuDynInst->computeUnit(); - - if (!isScalarReg(_opIdx)) { - if (_opIdx == REG_EXEC_LO) { - ScalarRegU64 new_exec_mask_val - = wf->execMask().to_ullong(); - if (NumDwords == 1) { - std::memcpy((void*)&new_exec_mask_val, - (void*)srfData.data(), sizeof(VecElemU32)); - } else if (NumDwords == 2) { - std::memcpy((void*)&new_exec_mask_val, - (void*)srfData.data(), sizeof(VecElemU64)); - } else { - panic("Trying to write more than 2 DWORDS to EXEC\n"); - } - VectorMask new_exec_mask(new_exec_mask_val); - wf->execMask() = new_exec_mask; - DPRINTF(GPUSRF, "Write EXEC\n"); - DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val); - } else if (_opIdx == REG_EXEC_HI) { - /** - * If we're writing only the upper half of the EXEC mask - * this ought to be a single dword operand. 
- */ - assert(NumDwords == 1); - ScalarRegU32 new_exec_mask_hi_val(0); - ScalarRegU64 new_exec_mask_val - = wf->execMask().to_ullong(); - std::memcpy((void*)&new_exec_mask_hi_val, - (void*)srfData.data(), sizeof(new_exec_mask_hi_val)); - replaceBits(new_exec_mask_val, 63, 32, - new_exec_mask_hi_val); - VectorMask new_exec_mask(new_exec_mask_val); - wf->execMask() = new_exec_mask; - DPRINTF(GPUSRF, "Write EXEC\n"); - DPRINTF(GPUSRF, "EXEC = %#x\n", new_exec_mask_val); - } else { - _gpuDynInst->writeMiscReg(_opIdx, srfData[0]); - } - } else { - for (auto i = 0; i < NumDwords; ++i) { - int sgprIdx = regIdx(i); - auto &sgpr = cu->srf[wf->simdId]->readWriteable(sgprIdx); - if (_gpuDynInst->isLoad()) { - assert(sizeof(DataType) <= sizeof(ScalarRegU64)); - sgpr = reinterpret_cast( - _gpuDynInst->scalar_data)[i]; - } else { - sgpr = srfData[i]; - } - DPRINTF(GPUSRF, "Write s[%d]\n", sgprIdx); - cu->srf[wf->simdId]->printReg(wf, sgprIdx); - } - } - } - - /** - * bit access to scalar data. primarily used for setting vcc bits. - */ - template - typename std::enable_if_t - setBit(int bit, int bit_val) - { - DataType &sgpr = *((DataType*)srfData.data()); - replaceBits(sgpr, bit, bit_val); - } - - template - typename std::enable_if_t - operator=(DataType rhs) - { - std::memcpy((void*)srfData.data(), (void*)&rhs, sizeof(DataType)); - return *this; - } - - private: - /** - * we have determined that we are not reading our scalar operand data - * from the register file, so here we figure out which special value - * we are reading (i.e., float constant, int constant, inline - * constant, or various other system registers (e.g., exec mask). 
- */ - void - readSpecialVal() - { - assert(NumDwords == 1 || NumDwords == 2); - - switch(_opIdx) { - case REG_EXEC_LO: - { - if (NumDwords == 1) { - ScalarRegU32 exec_mask = _gpuDynInst->wavefront()-> - execMask().to_ulong(); - std::memcpy((void*)srfData.data(), (void*)&exec_mask, - sizeof(exec_mask)); - DPRINTF(GPUSRF, "Read EXEC\n"); - DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask); - } else { - assert(NumDwords == 2); - ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> - execMask().to_ullong(); - std::memcpy((void*)srfData.data(), (void*)&exec_mask, - sizeof(exec_mask)); - DPRINTF(GPUSRF, "Read EXEC\n"); - DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask); - } - } - break; - case REG_EXEC_HI: - { - /** - * If we're reading only the upper half of the EXEC mask - * this ought to be a single dword operand. - */ - assert(NumDwords == 1); - ScalarRegU64 exec_mask = _gpuDynInst->wavefront() - ->execMask().to_ullong(); - - ScalarRegU32 exec_mask_hi = bits(exec_mask, 63, 32); - std::memcpy((void*)srfData.data(), (void*)&exec_mask_hi, - sizeof(exec_mask_hi)); - DPRINTF(GPUSRF, "Read EXEC_HI\n"); - DPRINTF(GPUSRF, "EXEC_HI = %#x\n", exec_mask_hi); - } - break; - case REG_SRC_SWDA: - case REG_SRC_DPP: - case REG_SRC_LITERAL: - assert(NumDwords == 1); - srfData[0] = _gpuDynInst->srcLiteral(); - break; - case REG_POS_HALF: - { - typename OpTraits::FloatT pos_half = 0.5; - std::memcpy((void*)srfData.data(), (void*)&pos_half, - sizeof(pos_half)); - - } - break; - case REG_NEG_HALF: - { - typename OpTraits::FloatT neg_half = -0.5; - std::memcpy((void*)srfData.data(), (void*)&neg_half, - sizeof(neg_half)); - } - break; - case REG_POS_ONE: - { - typename OpTraits::FloatT pos_one = 1.0; - std::memcpy(srfData.data(), &pos_one, sizeof(pos_one)); - } - break; - case REG_NEG_ONE: - { - typename OpTraits::FloatT neg_one = -1.0; - std::memcpy(srfData.data(), &neg_one, sizeof(neg_one)); - } - break; - case REG_POS_TWO: - { - typename OpTraits::FloatT pos_two = 2.0; - std::memcpy(srfData.data(), 
&pos_two, sizeof(pos_two)); - } - break; - case REG_NEG_TWO: - { - typename OpTraits::FloatT neg_two = -2.0; - std::memcpy(srfData.data(), &neg_two, sizeof(neg_two)); - } - break; - case REG_POS_FOUR: - { - typename OpTraits::FloatT pos_four = 4.0; - std::memcpy(srfData.data(), &pos_four, sizeof(pos_four)); - } - break; - case REG_NEG_FOUR: - { - typename OpTraits::FloatT neg_four = -4.0; - std::memcpy((void*)srfData.data(), (void*)&neg_four , - sizeof(neg_four)); - } - break; - case REG_PI: - { - assert(sizeof(DataType) == sizeof(ScalarRegF64) - || sizeof(DataType) == sizeof(ScalarRegF32)); - - const ScalarRegU32 pi_u32(0x3e22f983UL); - const ScalarRegU64 pi_u64(0x3fc45f306dc9c882ULL); - - if (sizeof(DataType) == sizeof(ScalarRegF64)) { - std::memcpy((void*)srfData.data(), - (void*)&pi_u64, sizeof(pi_u64)); - } else { - std::memcpy((void*)srfData.data(), - (void*)&pi_u32, sizeof(pi_u32)); - } - } - break; - default: - { - assert(sizeof(DataType) <= sizeof(srfData)); - DataType misc_val(0); - if (isConstVal(_opIdx)) { - misc_val = (DataType)_gpuDynInst - ->readConstVal(_opIdx); - } else { - misc_val = (DataType)_gpuDynInst->readMiscReg(_opIdx); - } - std::memcpy((void*)srfData.data(), (void*)&misc_val, - sizeof(DataType)); - } - } - } - - /** - * for scalars we need to do some extra work to figure out how to - * map the op selector to the sgpr idx because some op selectors - * do not map directly to the srf (i.e., vcc/flat_scratch). 
- */ - int - regIdx(int dword) const - { - Wavefront *wf = _gpuDynInst->wavefront(); - ComputeUnit *cu = _gpuDynInst->computeUnit(); - int sgprIdx(-1); - - if (_opIdx == REG_VCC_HI) { - sgprIdx = cu->registerManager - ->mapSgpr(wf, wf->reservedScalarRegs - 1 + dword); - } else if (_opIdx == REG_VCC_LO) { - sgprIdx = cu->registerManager - ->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword); - } else if (_opIdx == REG_FLAT_SCRATCH_HI) { - sgprIdx = cu->registerManager - ->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword); - } else if (_opIdx == REG_FLAT_SCRATCH_LO) { - assert(NumDwords == 1); - sgprIdx = cu->registerManager - ->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword); - } else { - sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword); - } - - assert(sgprIdx > -1); - - return sgprIdx; - } - - /** - * in GCN3 each register is represented as a 32b unsigned value, - * however operands may require up to 16 registers, so we store - * all the individual 32b components here. for sub-dword operand - * we still consider them to be 1 dword because the minimum size - * of a register is 1 dword. this class will take care to do the - * proper packing/unpacking of sub-dword operands. 
- */ - std::array srfData; - }; - - // typedefs for the various sizes/types of scalar operands - using ScalarOperandU8 = ScalarOperand; - using ScalarOperandI8 = ScalarOperand; - using ScalarOperandU16 = ScalarOperand; - using ScalarOperandI16 = ScalarOperand; - using ScalarOperandU32 = ScalarOperand; - using ScalarOperandI32 = ScalarOperand; - using ScalarOperandF32 = ScalarOperand; - using ScalarOperandU64 = ScalarOperand; - using ScalarOperandI64 = ScalarOperand; - using ScalarOperandF64 = ScalarOperand; - using ScalarOperandU128 = ScalarOperand; - using ScalarOperandU256 = ScalarOperand; - using ScalarOperandU512 = ScalarOperand; - // non-writeable versions of scalar operands - using ConstScalarOperandU8 = ScalarOperand; - using ConstScalarOperandI8 = ScalarOperand; - using ConstScalarOperandU16 = ScalarOperand; - using ConstScalarOperandI16 = ScalarOperand; - using ConstScalarOperandU32 = ScalarOperand; - using ConstScalarOperandI32 = ScalarOperand; - using ConstScalarOperandF32 = ScalarOperand; - using ConstScalarOperandU64 = ScalarOperand; - using ConstScalarOperandI64 = ScalarOperand; - using ConstScalarOperandF64 = ScalarOperand; - using ConstScalarOperandU128 = ScalarOperand; - using ConstScalarOperandU256 = ScalarOperand; - using ConstScalarOperandU512 = ScalarOperand; - // typedefs for the various sizes/types of vector operands - using VecOperandU8 = VecOperand; - using VecOperandI8 = VecOperand; - using VecOperandU16 = VecOperand; - using VecOperandI16 = VecOperand; - using VecOperandU32 = VecOperand; - using VecOperandI32 = VecOperand; - using VecOperandF32 = VecOperand; - using VecOperandU64 = VecOperand; - using VecOperandF64 = VecOperand; - using VecOperandI64 = VecOperand; - using VecOperandU96 = VecOperand; - using VecOperandU128 = VecOperand; - using VecOperandU256 = VecOperand; - using VecOperandU512 = VecOperand; - // non-writeable versions of vector operands - using ConstVecOperandU8 = VecOperand; - using ConstVecOperandI8 = VecOperand; - 
using ConstVecOperandU16 = VecOperand; - using ConstVecOperandI16 = VecOperand; - using ConstVecOperandU32 = VecOperand; - using ConstVecOperandI32 = VecOperand; - using ConstVecOperandF32 = VecOperand; - using ConstVecOperandU64 = VecOperand; - using ConstVecOperandI64 = VecOperand; - using ConstVecOperandF64 = VecOperand; - using ConstVecOperandU96 = VecOperand; - using ConstVecOperandU128 = VecOperand; - using ConstVecOperandU256 = VecOperand; - using ConstVecOperandU512 = VecOperand; -} - -} // namespace gem5 - -#endif // __ARCH_GCN3_OPERAND_HH__ diff --git a/src/arch/amdgpu/gcn3/registers.cc b/src/arch/amdgpu/gcn3/registers.cc deleted file mode 100644 index 7f1d0dba37..0000000000 --- a/src/arch/amdgpu/gcn3/registers.cc +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "arch/amdgpu/gcn3/gpu_registers.hh" - -namespace gem5 -{ - -namespace Gcn3ISA -{ - std::string - opSelectorToRegSym(int idx, int numRegs) - { - std::string reg_sym; - - // we have an SGPR - if (idx <= REG_SGPR_MAX) { - if (numRegs > 1) - reg_sym = "s[" + std::to_string(idx) + ":" + - std::to_string(idx + numRegs - 1) + "]"; - else - reg_sym = "s" + std::to_string(idx); - return reg_sym; - } else if (idx >= REG_VGPR_MIN && idx <= REG_VGPR_MAX) { - if (numRegs > 1) - reg_sym = "v[" + std::to_string(idx - REG_VGPR_MIN) + ":" + - std::to_string(idx - REG_VGPR_MIN + numRegs - 1) + "]"; - else - reg_sym = "v" + std::to_string(idx - REG_VGPR_MIN); - return reg_sym; - } else if (idx >= REG_INT_CONST_POS_MIN && - idx <= REG_INT_CONST_POS_MAX) { - reg_sym = std::to_string(idx - REG_INT_CONST_POS_MIN + 1); - return reg_sym; - } else if (idx >= REG_INT_CONST_NEG_MIN && - idx <= REG_INT_CONST_NEG_MAX) { - int inline_val = -1 - (idx - REG_INT_CONST_NEG_MIN); - reg_sym = std::to_string(inline_val); - return reg_sym; - } - - switch (idx) { - case REG_FLAT_SCRATCH_LO: - reg_sym = "flat_scratch_lo"; - break; - case REG_FLAT_SCRATCH_HI: - reg_sym = "flat_scratch_hi"; - break; - case REG_VCC_LO: - reg_sym = "vcc_lo"; - break; - case REG_VCC_HI: - reg_sym = "vcc_hi"; - break; - case REG_M0: - reg_sym = "m0"; - break; - case REG_EXEC_LO: - reg_sym = "exec"; - break; - case REG_ZERO: - reg_sym = "0"; - break; - case REG_POS_HALF: - 
reg_sym = "0.5"; - break; - case REG_NEG_HALF: - reg_sym = "-0.5"; - break; - case REG_POS_ONE: - reg_sym = "1"; - break; - case REG_NEG_ONE: - reg_sym = "-1"; - break; - case REG_POS_TWO: - reg_sym = "2"; - break; - case REG_NEG_TWO: - reg_sym = "-2"; - break; - case REG_POS_FOUR: - reg_sym = "4"; - break; - case REG_NEG_FOUR: - reg_sym = "-4"; - break; - default: - fatal("GCN3 ISA instruction has unknown register index %u\n", idx); - break; - } - - return reg_sym; - } - - int - opSelectorToRegIdx(int idx, int numScalarRegs) - { - int regIdx = -1; - - if (idx <= REG_SGPR_MAX) { - regIdx = idx; - } else if (idx >= REG_VGPR_MIN && idx <= REG_VGPR_MAX) { - regIdx = idx - REG_VGPR_MIN; - } else if (idx == REG_VCC_LO) { - /** - * the VCC register occupies the two highest numbered - * SRF entries. VCC is typically indexed by specifying - * VCC_LO (simply called VCC) in the instruction encoding - * and reading it as a 64b value so we only return the - * index to the lower half of the VCC register. - * - * VCC_LO = s[NUM_SGPRS - 2] - * VCC_HI = s[NUM_SGPRS - 1] - * - */ - regIdx = numScalarRegs - 2; - } else if (idx == REG_VCC_HI) { - regIdx = numScalarRegs - 1; - } else if (idx == REG_FLAT_SCRATCH_LO) { - /** - * the FLAT_SCRATCH register occupies the two SRF entries - * just below VCC. FLAT_SCRATCH is typically indexed by - * specifying FLAT_SCRATCH_LO (simply called FLAT_SCRATCH) - * in the instruction encoding and reading it as a 64b value - * so we only return the index to the lower half of the - * FLAT_SCRATCH register. 
- * - * FLAT_SCRATCH_LO = s[NUM_SGPRS - 4] - * FLAT_SCRATCH_HI = s[NUM_SGPRS - 3] - * - */ - regIdx = numScalarRegs - 4; - } else if (idx == REG_FLAT_SCRATCH_HI) { - regIdx = numScalarRegs - 3; - } - - return regIdx; - } - - bool - isPosConstVal(int opIdx) - { - bool is_pos_const_val = (opIdx >= REG_INT_CONST_POS_MIN - && opIdx <= REG_INT_CONST_POS_MAX); - - return is_pos_const_val; - } - - bool - isNegConstVal(int opIdx) - { - bool is_neg_const_val = (opIdx >= REG_INT_CONST_NEG_MIN - && opIdx <= REG_INT_CONST_NEG_MAX); - - return is_neg_const_val; - } - - bool - isConstVal(int opIdx) - { - bool is_const_val = isPosConstVal(opIdx) || isNegConstVal(opIdx); - return is_const_val; - } - - bool - isLiteral(int opIdx) - { - return opIdx == REG_SRC_LITERAL; - } - - bool - isExecMask(int opIdx) - { - return opIdx == REG_EXEC_LO || opIdx == REG_EXEC_HI; - } - - bool - isVccReg(int opIdx) - { - return opIdx == REG_VCC_LO || opIdx == REG_VCC_HI; - } - - bool - isFlatScratchReg(int opIdx) - { - return opIdx == REG_FLAT_SCRATCH_LO || opIdx == REG_FLAT_SCRATCH_HI; - } - - bool - isScalarReg(int opIdx) - { - // FLAT_SCRATCH and VCC are stored in an SGPR pair - if (opIdx <= REG_SGPR_MAX || opIdx == REG_FLAT_SCRATCH_LO || - opIdx == REG_FLAT_SCRATCH_HI || opIdx == REG_VCC_LO || - opIdx == REG_VCC_HI) { - return true; - } - - return false; - } - - bool - isVectorReg(int opIdx) - { - if (opIdx >= REG_VGPR_MIN && opIdx <= REG_VGPR_MAX) - return true; - - return false; - } - -} // namespace Gcn3ISA -} // namespace gem5 diff --git a/src/arch/amdgpu/vega/SConscript b/src/arch/amdgpu/vega/SConscript index 9c6a01bf81..f40bac2a1e 100644 --- a/src/arch/amdgpu/vega/SConscript +++ b/src/arch/amdgpu/vega/SConscript @@ -49,11 +49,31 @@ Source('tlb_coalescer.cc') DebugFlag('GPUPTWalker', 'Debug flag for GPU page table walker') if env['CONF']['TARGET_GPU_ISA'] == 'vega': - Source('decoder.cc') + Source('gpu_decoder.cc') Source('insts/gpu_static_inst.cc') - Source('insts/instructions.cc') 
Source('insts/op_encodings.cc') - Source('isa.cc') - Source('registers.cc') + Source('gpu_isa.cc') + Source('gpu_registers.cc') + + Source('insts/sop2.cc') + Source('insts/sopk.cc') + Source('insts/sop1.cc') + Source('insts/sopc.cc') + Source('insts/sopp.cc') + Source('insts/smem.cc') + Source('insts/vop2.cc') + Source('insts/vop1.cc') + Source('insts/vopc.cc') + Source('insts/vinterp.cc') + Source('insts/vop3.cc') + Source('insts/vop3_cmp.cc') + Source('insts/ds.cc') + Source('insts/mubuf.cc') + Source('insts/mtbuf.cc') + Source('insts/mimg.cc') + Source('insts/exp.cc') + Source('insts/flat.cc') + Source('insts/vop3p.cc') + Source('insts/vop3p_mai.cc') DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA') diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc similarity index 96% rename from src/arch/amdgpu/vega/decoder.cc rename to src/arch/amdgpu/vega/gpu_decoder.cc index 065f8c8493..43c33e44cc 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -29,11 +29,13 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "arch/amdgpu/vega/gpu_decoder.hh" + #include -#include "arch/amdgpu/vega/gpu_decoder.hh" #include "arch/amdgpu/vega/insts/gpu_static_inst.hh" #include "arch/amdgpu/vega/insts/instructions.hh" +#include "arch/amdgpu/vega/insts/vop3p.hh" namespace gem5 { @@ -498,10 +500,10 @@ namespace VegaISA &Decoder::subDecode_OP_FLAT, &Decoder::subDecode_OP_FLAT, &Decoder::subDecode_OP_FLAT, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::subDecode_OP_FLAT, + &Decoder::subDecode_OP_FLAT, + &Decoder::subDecode_OP_FLAT, + &Decoder::subDecode_OP_FLAT, &Decoder::subDecode_OP_MUBUF, &Decoder::subDecode_OP_MUBUF, &Decoder::subDecode_OP_MUBUF, @@ -884,7 +886,7 @@ namespace VegaISA &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OPU_VOP3__V_FMAC_F32, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -1089,7 +1091,7 @@ namespace VegaISA &Decoder::decode_OPU_VOP3__V_MAD_I16, &Decoder::decode_OPU_VOP3__V_FMA_F16, &Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16, - &Decoder::decode_invalid, + &Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -1236,14 +1238,14 @@ namespace VegaISA &Decoder::decode_OPU_VOP3__V_CVT_PK_I16_I32, &Decoder::decode_OPU_VOP3__V_PKNORM_I16_F16, &Decoder::decode_OPU_VOP3__V_PKNORM_U16_F16, + &Decoder::decode_invalid, &Decoder::decode_OPU_VOP3__V_ADD_I32, &Decoder::decode_OPU_VOP3__V_SUB_I32, &Decoder::decode_OPU_VOP3__V_ADD_I16, &Decoder::decode_OPU_VOP3__V_SUB_I16, &Decoder::decode_OPU_VOP3__V_PACK_B32_F16, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OPU_VOP3__V_CVT_PK_FP8_F32, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -1678,9 +1680,9 @@ namespace VegaISA &Decoder::decode_OP_FLAT__FLAT_ATOMIC_DEC, 
&Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_FLAT__FLAT_ATOMIC_ADD_F64, + &Decoder::decode_OP_FLAT__FLAT_ATOMIC_MIN_F64, + &Decoder::decode_OP_FLAT__FLAT_ATOMIC_MAX_F64, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -1807,11 +1809,11 @@ namespace VegaISA &Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_XOR, &Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_INC, &Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_DEC, + &Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F32, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F64, + &Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_MIN_F64, + &Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_MAX_F64, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -3116,7 +3118,7 @@ namespace VegaISA &Decoder::decode_OP_VOP1__V_CLREXCP, &Decoder::decode_invalid, &Decoder::decode_OP_VOP1__V_SCREEN_PARTITION_4SE_B32, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP1__V_MOV_B64, &Decoder::decode_OP_VOP1__V_CVT_F16_U16, &Decoder::decode_OP_VOP1__V_CVT_F16_I16, &Decoder::decode_OP_VOP1__V_CVT_U16_F16, @@ -3142,7 +3144,7 @@ namespace VegaISA &Decoder::decode_OP_VOP1__V_SAT_PK_U8_I16, &Decoder::decode_invalid, &Decoder::decode_OP_VOP1__V_SWAP_B32, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP1__V_ACCVGPR_MOV_B32, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -3613,12 +3615,23 @@ namespace VegaISA &Decoder::decode_OP_VOP3P__V_MAD_MIX_F32, &Decoder::decode_OP_VOP3P__V_MAD_MIXLO_F16, &Decoder::decode_OP_VOP3P__V_MAD_MIXHI_F16, + &Decoder::decode_OP_VOP3P__V_DOT2_F32_F16, &Decoder::decode_invalid, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_DOT2_I32_I16, + &Decoder::decode_OP_VOP3P__V_DOT2_U32_U16, + 
&Decoder::decode_OP_VOP3P__V_DOT4_I32_I8, + &Decoder::decode_OP_VOP3P__V_DOT4_U32_U8, + &Decoder::decode_OP_VOP3P__V_DOT8_I32_I4, + &Decoder::decode_OP_VOP3P__V_DOT8_U32_U4, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_PK_FMA_F32, + &Decoder::decode_OP_VOP3P__V_PK_MUL_F32, + &Decoder::decode_OP_VOP3P__V_PK_ADD_F32, + &Decoder::decode_OP_VOP3P__V_PK_MOV_B32, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -3631,81 +3644,70 @@ namespace VegaISA &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X1_2B_F32, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X1_4B_F32, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_4X4X1_16B_F32, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X2_F32, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X4_F32, &Decoder::decode_invalid, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_F16, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_F16, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_F16, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X8_F16, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X16_F16, &Decoder::decode_invalid, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_I32_32X32X4_2B_I8, + &Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X4_4B_I8, + &Decoder::decode_OP_VOP3P__V_MFMA_I32_4X4X4_16B_I8, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_I32_32X32X8_I8, + &Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16_I8, + &Decoder::decode_OP_VOP3P__V_MFMA_I32_32X32X16_I8, + &Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X32_I8, + &Decoder::decode_OP_VOP3P__V_ACCVGPR_READ, + &Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_BF16, + 
&Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_BF16, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_BF16, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X8_BF16, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X16_BF16, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_F16, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_F16, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_BF16, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_BF16, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_SMFMAC_I32_16X16X64_I8, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_SMFMAC_I32_32X32X32_I8, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, 
- &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4_F64, + &Decoder::decode_OP_VOP3P__V_MFMA_F64_4X4X4_4B_F64, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_BF8, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_FP8, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_BF8, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_FP8, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_BF8, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_FP8, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_BF8, + &Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_FP8, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_BF8, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_FP8, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_BF8, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_FP8, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_BF8, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_FP8, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_BF8, + &Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_FP8, }; GPUStaticInst* @@ -4202,8 +4204,7 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_VOP2__V_FMAC_F32(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP2__V_FMAC_F32(&iFmt->iFmt_VOP2); } GPUStaticInst* @@ -4216,8 +4217,7 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_VOP2__V_XNOR_B32(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP2__V_XNOR_B32(&iFmt->iFmt_VOP2); } GPUStaticInst* @@ -6172,6 +6172,12 @@ namespace VegaISA return new 
Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3A); } // decode_OPU_VOP3__V_SUBREV_U32 + GPUStaticInst* + Decoder::decode_OPU_VOP3__V_FMAC_F32(MachInst iFmt) + { + return new Inst_VOP3__V_FMAC_F32(&iFmt->iFmt_VOP3A); + } // decode_OPU_VOP3__V_FMAC_F32 + GPUStaticInst* Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt) { @@ -7053,6 +7059,12 @@ namespace VegaISA return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3A); } + GPUStaticInst* + Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst iFmt) + { + return new Inst_VOP3__V_LSHL_ADD_U64(&iFmt->iFmt_VOP3A); + } + GPUStaticInst* Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt) { @@ -7289,6 +7301,12 @@ namespace VegaISA return nullptr; } + GPUStaticInst* + Decoder::decode_OPU_VOP3__V_CVT_PK_FP8_F32(MachInst iFmt) + { + return new Inst_VOP3__V_CVT_PK_FP8_F32(&iFmt->iFmt_VOP3A); + } + GPUStaticInst* Decoder::decode_OP_DS__DS_ADD_U32(MachInst iFmt) { @@ -7796,15 +7814,13 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_DS__DS_READ_U16_D16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_DS__DS_READ_U16_D16(&iFmt->iFmt_DS); } GPUStaticInst* Decoder::decode_OP_DS__DS_READ_U16_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_DS__DS_READ_U16_D16_HI(&iFmt->iFmt_DS); } GPUStaticInst* @@ -8292,8 +8308,7 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_FLAT__FLAT_STORE_SHORT_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -8440,6 +8455,24 @@ namespace VegaISA return new Inst_FLAT__FLAT_ATOMIC_DEC(&iFmt->iFmt_FLAT); } // decode_OP_FLAT__FLAT_ATOMIC_DEC + GPUStaticInst* + Decoder::decode_OP_FLAT__FLAT_ATOMIC_ADD_F64(MachInst iFmt) + { + return new Inst_FLAT__FLAT_ATOMIC_ADD_F64(&iFmt->iFmt_FLAT); + } // decode_OP_FLAT__FLAT_ATOMIC_ADD_F64 + + GPUStaticInst* 
+ Decoder::decode_OP_FLAT__FLAT_ATOMIC_MIN_F64(MachInst iFmt) + { + return new Inst_FLAT__FLAT_ATOMIC_MIN_F64(&iFmt->iFmt_FLAT); + } // decode_OP_FLAT__FLAT_ATOMIC_MIN_F64 + + GPUStaticInst* + Decoder::decode_OP_FLAT__FLAT_ATOMIC_MAX_F64(MachInst iFmt) + { + return new Inst_FLAT__FLAT_ATOMIC_MAX_F64(&iFmt->iFmt_FLAT); + } // decode_OP_FLAT__FLAT_ATOMIC_MAX_F64 + GPUStaticInst* Decoder::decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2(MachInst iFmt) { @@ -8588,15 +8621,13 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_GLOBAL__GLOBAL_STORE_SHORT_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_GLOBAL__GLOBAL_STORE_DWORD(MachInst iFmt) { return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT); - return nullptr; } GPUStaticInst* @@ -8737,6 +8768,40 @@ namespace VegaISA return new Inst_FLAT__FLAT_ATOMIC_DEC(&iFmt->iFmt_FLAT); } + GPUStaticInst* + Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F32(MachInst iFmt) + { + // Note: There is no flat_atomic_add_f32 as of MI200. However, gem5 + // impelements all global and scratch instructions as Inst_FLAT. + return new Inst_FLAT__FLAT_ATOMIC_ADD_F32(&iFmt->iFmt_FLAT); + } + + GPUStaticInst* + Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_PK_ADD_F16(MachInst iFmt) + { + // Note: There is no flat_atomic_pk_add_f16 as of MI200. However, gem5 + // impelements all global and scratch instructions as Inst_FLAT. 
+ return new Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16(&iFmt->iFmt_FLAT); + } + + GPUStaticInst* + Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F64(MachInst iFmt) + { + return new Inst_FLAT__FLAT_ATOMIC_ADD_F64(&iFmt->iFmt_FLAT); + } + + GPUStaticInst* + Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_MIN_F64(MachInst iFmt) + { + return new Inst_FLAT__FLAT_ATOMIC_MIN_F64(&iFmt->iFmt_FLAT); + } + + GPUStaticInst* + Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_MAX_F64(MachInst iFmt) + { + return new Inst_FLAT__FLAT_ATOMIC_MAX_F64(&iFmt->iFmt_FLAT); + } + GPUStaticInst* Decoder::decode_OP_GLOBAL__GLOBAL_ATOMIC_SWAP_X2(MachInst iFmt) { @@ -9838,64 +9903,55 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_UBYTE(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_UBYTE(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_SBYTE(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_SBYTE(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_USHORT(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_USHORT(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_SSHORT(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_SSHORT(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT); } GPUStaticInst* 
Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_BYTE(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_BYTE(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -9908,43 +9964,37 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_SHORT(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_SHORT(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_SHORT_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new 
Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -10154,14 +10204,12 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16(&iFmt->iFmt_MUBUF); } GPUStaticInst* Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(&iFmt->iFmt_MUBUF); } GPUStaticInst* Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_HI_X(MachInst iFmt) @@ -11581,6 +11629,12 @@ namespace VegaISA return nullptr; } + GPUStaticInst* + Decoder::decode_OP_VOP1__V_MOV_B64(MachInst iFmt) + { + return new Inst_VOP1__V_MOV_B64(&iFmt->iFmt_VOP1); + } // decode_OP_VOP1__V_MOV_B64 + GPUStaticInst* Decoder::decode_OP_VOP1__V_CVT_F16_U16(MachInst iFmt) { @@ -11729,6 +11783,12 @@ namespace VegaISA return nullptr; } + GPUStaticInst* + Decoder::decode_OP_VOP1__V_ACCVGPR_MOV_B32(MachInst iFmt) + { + return new Inst_VOP1__V_ACCVGPR_MOV_B32(&iFmt->iFmt_VOP1); + } + GPUStaticInst* Decoder::decode_OP_VOPC__V_CMP_CLASS_F32(MachInst iFmt) { @@ -12920,134 +12980,115 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAD_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAD_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MUL_LO_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MUL_LO_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ADD_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ADD_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_SUB_I16(MachInst iFmt) { - fatal("Trying to decode 
instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_SUB_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_LSHLREV_B16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_LSHLREV_B16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_LSHRREV_B16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_LSHRREV_B16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ASHRREV_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ASHRREV_B16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAX_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAX_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MIN_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MIN_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAD_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAD_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ADD_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ADD_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_SUB_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_SUB_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAX_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new 
Inst_VOP3P__V_PK_MAX_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MIN_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MIN_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_FMA_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_FMA_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ADD_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ADD_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MUL_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MUL_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MIN_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MIN_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAX_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAX_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* @@ -13071,6 +13112,406 @@ namespace VegaISA return nullptr; } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_PK_FMA_F32(MachInst iFmt) + { + return new Inst_VOP3P__V_PK_FMA_F32(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_PK_MUL_F32(MachInst iFmt) + { + return new Inst_VOP3P__V_PK_MUL_F32(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_PK_ADD_F32(MachInst iFmt) + { + return new Inst_VOP3P__V_PK_ADD_F32(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_PK_MOV_B32(MachInst iFmt) + { + return new Inst_VOP3P__V_PK_MOV_B32(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + 
Decoder::decode_OP_VOP3P__V_DOT2_F32_F16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_F32_F16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_I32_I16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_I32_I16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT2_U32_U16(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT2_U32_U16(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT4_I32_I8(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT4_I32_I8(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT4_U32_U8(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT4_U32_U8(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT8_I32_I4(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT8_I32_I4(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_DOT8_U32_U4(MachInst iFmt) + { + return new Inst_VOP3P__V_DOT8_U32_U4(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X1_2B_F32(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_32X32X1_2B_F32( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X1_4B_F32(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_16X16X1_4B_F32( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_4X4X1_16B_F32(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_4X4X1_16B_F32( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X2_F32(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_32X32X2_F32( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X4_F32(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_16X16X4_F32( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_F16(MachInst iFmt) + { + return new 
Inst_VOP3P_MAI__V_MFMA_F32_32X32X4_2B_F16( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_F16(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_16X16X4_4B_F16( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_F16(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_4X4X4_16B_F16( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X8_F16(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_32X32X8_F16( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X16_F16(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_16X16X16_F16( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_I32_32X32X4_2B_I8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X4_4B_I8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_I32_4X4X4_16B_I8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16_I8(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_I32_16X16X16_I8( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_I32_32X32X8_I8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_I32_32X32X16_I8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X32_I8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + 
Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_BF16(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_BF16(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_BF16(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X8_BF16(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F32_32X32X8_BF16( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X16_BF16(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_F16(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_F16(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_BF16(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_BF16(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_I32_16X16X64_I8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_I32_32X32X32_I8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F64_4X4X4_4B_F64(MachInst iFmt) + { + return new 
Inst_VOP3P_MAI__V_MFMA_F64_4X4X4_4B_F64( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_BF8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_FP8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_BF8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_FP8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_BF8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_FP8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_BF8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_FP8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_BF8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_FP8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_BF8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + 
} + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_FP8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_BF8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_FP8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_BF8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_FP8(MachInst iFmt) + { + fatal("Trying to decode instruction without a class\n"); + return nullptr; + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4_F64(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F64_16X16X4_F64( + &iFmt->iFmt_VOP3P_MAI); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_ACCVGPR_READ(MachInst iFmt) + { + return new Inst_VOP3P__V_ACCVGPR_READ(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst iFmt) + { + return new Inst_VOP3P__V_ACCVGPR_WRITE(&iFmt->iFmt_VOP3P); + } + GPUStaticInst* Decoder::decode_invalid(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index af989e0cc7..285377ad3d 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -325,6 +325,7 @@ namespace VegaISA GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_FMAC_F32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst); GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst); GPUStaticInst* 
decode_OPU_VOP3__V_CVT_I32_F64(MachInst); @@ -470,6 +471,7 @@ namespace VegaISA GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst); @@ -508,6 +510,7 @@ namespace VegaISA GPUStaticInst* decode_OPU_VOP3__V_ADD_I16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_SUB_I16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_PACK_B32_F16(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_CVT_PK_FP8_F32(MachInst); GPUStaticInst* decode_OP_DS__DS_ADD_U32(MachInst); GPUStaticInst* decode_OP_DS__DS_SUB_U32(MachInst); GPUStaticInst* decode_OP_DS__DS_RSUB_U32(MachInst); @@ -698,6 +701,9 @@ namespace VegaISA GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_XOR(MachInst); GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_INC(MachInst); GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_DEC(MachInst); + GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD_F64(MachInst); + GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_MIN_F64(MachInst); + GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_MAX_F64(MachInst); GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_SWAP_X2(MachInst); GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_CMPSWAP_X2(MachInst); GPUStaticInst* decode_OP_FLAT__FLAT_ATOMIC_ADD_X2(MachInst); @@ -746,6 +752,11 @@ namespace VegaISA GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_XOR(MachInst); GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_INC(MachInst); GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_DEC(MachInst); + GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F32(MachInst); + GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_PK_ADD_F16(MachInst); + GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_F64(MachInst); + GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_MIN_F64(MachInst); + 
GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_MAX_F64(MachInst); GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_SWAP_X2(MachInst); GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_CMPSWAP_X2(MachInst); GPUStaticInst* decode_OP_GLOBAL__GLOBAL_ATOMIC_ADD_X2(MachInst); @@ -1279,6 +1290,7 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP1__V_FREXP_MANT_F32(MachInst); GPUStaticInst* decode_OP_VOP1__V_CLREXCP(MachInst); GPUStaticInst* decode_OP_VOP1__V_SCREEN_PARTITION_4SE_B32(MachInst); + GPUStaticInst* decode_OP_VOP1__V_MOV_B64(MachInst); GPUStaticInst* decode_OP_VOP1__V_CVT_F16_U16(MachInst); GPUStaticInst* decode_OP_VOP1__V_CVT_F16_I16(MachInst); GPUStaticInst* decode_OP_VOP1__V_CVT_U16_F16(MachInst); @@ -1303,6 +1315,7 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP1__V_CVT_NORM_U16_F16(MachInst); GPUStaticInst* decode_OP_VOP1__V_SAT_PK_U8_I16(MachInst); GPUStaticInst* decode_OP_VOP1__V_SWAP_B32(MachInst); + GPUStaticInst* decode_OP_VOP1__V_ACCVGPR_MOV_B32(MachInst); GPUStaticInst* decode_OP_VOP2__V_CNDMASK_B32(MachInst); GPUStaticInst* decode_OP_VOP2__V_ADD_F32(MachInst); GPUStaticInst* decode_OP_VOP2__V_SUB_F32(MachInst); @@ -1585,6 +1598,65 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_PK_FMA_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_F32_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_I32_I16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT2_U32_U16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT4_I32_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT4_U32_U8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_DOT8_I32_I4(MachInst); + GPUStaticInst* 
decode_OP_VOP3P__V_DOT8_U32_U4(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X1_2B_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X1_4B_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_4X4X1_16B_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X2_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X4_F32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X8_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X16_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_32X32X4_2B_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X4_4B_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_4X4X4_16B_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_32X32X8_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_32X32X16_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X32_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X4_2B_BF16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X4_4B_BF16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_4X4X4_16B_BF16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X8_BF16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X16_BF16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X32_BF16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X16_BF16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_I32_16X16X64_I8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_I32_32X32X32_I8(MachInst); 
+ GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4_F64(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_4X4X4_4B_F64(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_BF8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_BF8_FP8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_BF8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_16X16X32_FP8_FP8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_BF8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_BF8_FP8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_BF8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F32_32X32X16_FP8_FP8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_BF8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_BF8_FP8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_BF8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_16X16X64_FP8_FP8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_BF8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_BF8_FP8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_BF8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_SMFMAC_F32_32X32X32_FP8_FP8(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_READ(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst); GPUStaticInst* subDecode_OPU_VOP3(MachInst); GPUStaticInst* subDecode_OP_DS(MachInst); GPUStaticInst* subDecode_OP_FLAT(MachInst); @@ -1642,7 +1714,7 @@ namespace VegaISA struct InFmt_FLAT { unsigned int OFFSET : 13; - unsigned int LDS : 1; + unsigned int SVE : 1; unsigned int SEG : 2; unsigned int GLC : 1; unsigned int SLC : 1; @@ -1908,7 +1980,27 @@ namespace VegaISA unsigned int NEG : 3; }; - union InstFormat { + struct InFmt_VOP3P_MAI + { + unsigned int VDST : 8; + unsigned int CBSZ : 3; + unsigned int ABID : 4; + unsigned int ACC_CD : 1; + 
unsigned int OP : 7; + unsigned int ENCODING : 9; + }; + + struct InFmt_VOP3P_MAI_1 + { + unsigned int SRC0 : 9; + unsigned int SRC1 : 9; + unsigned int SRC2 : 9; + unsigned int ACC : 2; + unsigned int BLGP : 3; + }; + + union InstFormat + { InFmt_DS iFmt_DS; InFmt_DS_1 iFmt_DS_1; InFmt_EXP iFmt_EXP; @@ -1941,6 +2033,8 @@ namespace VegaISA InFmt_VOP_SDWAB iFmt_VOP_SDWAB; InFmt_VOP3P iFmt_VOP3P; InFmt_VOP3P_1 iFmt_VOP3P_1; + InFmt_VOP3P_MAI iFmt_VOP3P_MAI; + InFmt_VOP3P_MAI_1 iFmt_VOP3P_MAI_1; uint32_t imm_u32; float imm_f32; }; // union InstFormat diff --git a/src/arch/amdgpu/vega/isa.cc b/src/arch/amdgpu/vega/gpu_isa.cc similarity index 100% rename from src/arch/amdgpu/vega/isa.cc rename to src/arch/amdgpu/vega/gpu_isa.cc diff --git a/src/arch/amdgpu/vega/registers.cc b/src/arch/amdgpu/vega/gpu_registers.cc similarity index 95% rename from src/arch/amdgpu/vega/registers.cc rename to src/arch/amdgpu/vega/gpu_registers.cc index b7404379cc..302a89e2b8 100644 --- a/src/arch/amdgpu/vega/registers.cc +++ b/src/arch/amdgpu/vega/gpu_registers.cc @@ -89,6 +89,18 @@ namespace VegaISA case REG_ZERO: reg_sym = "0"; break; + case REG_SHARED_BASE: + reg_sym = "src_shared_base"; + break; + case REG_SHARED_LIMIT: + reg_sym = "src_shared_limit"; + break; + case REG_PRIVATE_BASE: + reg_sym = "src_private_base"; + break; + case REG_PRIVATE_LIMIT: + reg_sym = "src_private_limit"; + break; case REG_POS_HALF: reg_sym = "0.5"; break; diff --git a/src/arch/amdgpu/vega/gpu_registers.hh b/src/arch/amdgpu/vega/gpu_registers.hh index 63929d5917..f4d34a571c 100644 --- a/src/arch/amdgpu/vega/gpu_registers.hh +++ b/src/arch/amdgpu/vega/gpu_registers.hh @@ -106,10 +106,10 @@ namespace VegaISA REG_RESERVED_25 = 232, REG_RESERVED_26 = 233, REG_RESERVED_27 = 234, - REG_RESERVED_28 = 235, - REG_RESERVED_29 = 236, - REG_RESERVED_30 = 237, - REG_RESERVED_31 = 238, + REG_SHARED_BASE = 235, + REG_SHARED_LIMIT = 236, + REG_PRIVATE_BASE = 237, + REG_PRIVATE_LIMIT = 238, REG_RESERVED_32 = 239, REG_POS_HALF 
= 240, REG_NEG_HALF = 241, @@ -129,7 +129,7 @@ namespace VegaISA REG_LDS_DIRECT = 254, REG_SRC_LITERAL = 255, REG_VGPR_MIN = 256, - REG_VGPR_MAX = 511 + REG_VGPR_MAX = 767 }; constexpr size_t MaxOperandDwords(16); diff --git a/src/arch/amdgpu/vega/insts/ds.cc b/src/arch/amdgpu/vega/insts/ds.cc new file mode 100644 index 0000000000..c377daa487 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/ds.cc @@ -0,0 +1,4786 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_DS__DS_ADD_U32 class methods --- + + Inst_DS__DS_ADD_U32::Inst_DS__DS_ADD_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_u32") + { + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicAdd); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_ADD_U32 + + Inst_DS__DS_ADD_U32::~Inst_DS__DS_ADD_U32() + { + } // ~Inst_DS__DS_ADD_U32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] += DATA; + void + Inst_DS__DS_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_ADD_U32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 
= instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_ADD_U32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_SUB_U32 class methods --- + + Inst_DS__DS_SUB_U32::Inst_DS__DS_SUB_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_u32") + { + } // Inst_DS__DS_SUB_U32 + + Inst_DS__DS_SUB_U32::~Inst_DS__DS_SUB_U32() + { + } // ~Inst_DS__DS_SUB_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_U32 class methods --- + + Inst_DS__DS_RSUB_U32::Inst_DS__DS_RSUB_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_u32") + { + } // Inst_DS__DS_RSUB_U32 + + Inst_DS__DS_RSUB_U32::~Inst_DS__DS_RSUB_U32() + { + } // ~Inst_DS__DS_RSUB_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA - MEM[ADDR]; + // RETURN_DATA = tmp. + // Subtraction with reversed operands. + void + Inst_DS__DS_RSUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_U32 class methods --- + + Inst_DS__DS_INC_U32::Inst_DS__DS_INC_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_u32") + { + } // Inst_DS__DS_INC_U32 + + Inst_DS__DS_INC_U32::~Inst_DS__DS_INC_U32() + { + } // ~Inst_DS__DS_INC_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_INC_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_U32 class methods --- + + Inst_DS__DS_DEC_U32::Inst_DS__DS_DEC_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_u32") + { + } // Inst_DS__DS_DEC_U32 + + Inst_DS__DS_DEC_U32::~Inst_DS__DS_DEC_U32() + { + } // ~Inst_DS__DS_DEC_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_DS__DS_DEC_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_I32 class methods --- + + Inst_DS__DS_MIN_I32::Inst_DS__DS_MIN_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_i32") + { + } // Inst_DS__DS_MIN_I32 + + Inst_DS__DS_MIN_I32::~Inst_DS__DS_MIN_I32() + { + } // ~Inst_DS__DS_MIN_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_I32 class methods --- + + Inst_DS__DS_MAX_I32::Inst_DS__DS_MAX_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_i32") + { + } // Inst_DS__DS_MAX_I32 + + Inst_DS__DS_MAX_I32::~Inst_DS__DS_MAX_I32() + { + } // ~Inst_DS__DS_MAX_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_U32 class methods --- + + Inst_DS__DS_MIN_U32::Inst_DS__DS_MIN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_u32") + { + } // Inst_DS__DS_MIN_U32 + + Inst_DS__DS_MIN_U32::~Inst_DS__DS_MIN_U32() + { + } // ~Inst_DS__DS_MIN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_U32 class methods --- + + Inst_DS__DS_MAX_U32::Inst_DS__DS_MAX_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_u32") + { + } // Inst_DS__DS_MAX_U32 + + Inst_DS__DS_MAX_U32::~Inst_DS__DS_MAX_U32() + { + } // ~Inst_DS__DS_MAX_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_B32 class methods --- + + Inst_DS__DS_AND_B32::Inst_DS__DS_AND_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_b32") + { + } // Inst_DS__DS_AND_B32 + + Inst_DS__DS_AND_B32::~Inst_DS__DS_AND_B32() + { + } // ~Inst_DS__DS_AND_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_B32 class methods --- + + Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_b32") + { + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicOr); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_OR_B32 + + Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32() + { + } // ~Inst_DS__DS_OR_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] |= DATA; + void + Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_OR_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_OR_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + + // --- Inst_DS__DS_XOR_B32 class methods --- + + Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_b32") + { + } // Inst_DS__DS_XOR_B32 + + Inst_DS__DS_XOR_B32::~Inst_DS__DS_XOR_B32() + { + } // ~Inst_DS__DS_XOR_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // 
MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_B32 class methods --- + + Inst_DS__DS_MSKOR_B32::Inst_DS__DS_MSKOR_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_b32") + { + } // Inst_DS__DS_MSKOR_B32 + + Inst_DS__DS_MSKOR_B32::~Inst_DS__DS_MSKOR_B32() + { + } // ~Inst_DS__DS_MSKOR_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. + void + Inst_DS__DS_MSKOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_B32 class methods --- + + Inst_DS__DS_WRITE_B32::Inst_DS__DS_WRITE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B32 + + Inst_DS__DS_WRITE_B32::~Inst_DS__DS_WRITE_B32() + { + } // ~Inst_DS__DS_WRITE_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] = DATA. + // Write dword. 
+ void + Inst_DS__DS_WRITE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE2_B32 class methods --- + + Inst_DS__DS_WRITE2_B32::Inst_DS__DS_WRITE2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2_B32 + + Inst_DS__DS_WRITE2_B32::~Inst_DS__DS_WRITE2_B32() + { + } // ~Inst_DS__DS_WRITE2_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR_BASE + OFFSET0 * 4] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 4] = DATA2. + // Write 2 dwords. 
+ void + Inst_DS__DS_WRITE2_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 2] + = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 4; + Addr offset1 = instData.OFFSET1 * 4; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_WRITE2ST64_B32 class methods --- + + Inst_DS__DS_WRITE2ST64_B32::Inst_DS__DS_WRITE2ST64_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2st64_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2ST64_B32 + + Inst_DS__DS_WRITE2ST64_B32::~Inst_DS__DS_WRITE2ST64_B32() + { + } // ~Inst_DS__DS_WRITE2ST64_B32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR_BASE + OFFSET0 * 4 * 64] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 4 * 64] = DATA2; + // Write 2 dwords. 
+ void + Inst_DS__DS_WRITE2ST64_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 2] + = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 4 * 64; + Addr offset1 = instData.OFFSET1 * 4 * 64; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_CMPST_B32 class methods --- + + Inst_DS__DS_CMPST_B32::Inst_DS__DS_CMPST_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_b32") + { + } // Inst_DS__DS_CMPST_B32 + + Inst_DS__DS_CMPST_B32::~Inst_DS__DS_CMPST_B32() + { + } // ~Inst_DS__DS_CMPST_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP opcode. 
+ void + Inst_DS__DS_CMPST_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_F32 class methods --- + + Inst_DS__DS_CMPST_F32::Inst_DS__DS_CMPST_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_f32") + { + setFlag(F32); + } // Inst_DS__DS_CMPST_F32 + + Inst_DS__DS_CMPST_F32::~Inst_DS__DS_CMPST_F32() + { + } // ~Inst_DS__DS_CMPST_F32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP opcode. + void + Inst_DS__DS_CMPST_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_F32 class methods --- + + Inst_DS__DS_MIN_F32::Inst_DS__DS_MIN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_f32") + { + setFlag(F32); + } // Inst_DS__DS_MIN_F32 + + Inst_DS__DS_MIN_F32::~Inst_DS__DS_MIN_F32() + { + } // ~Inst_DS__DS_MIN_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN. + void + Inst_DS__DS_MIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_F32 class methods --- + + Inst_DS__DS_MAX_F32::Inst_DS__DS_MAX_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_f32") + { + setFlag(F32); + } // Inst_DS__DS_MAX_F32 + + Inst_DS__DS_MAX_F32::~Inst_DS__DS_MAX_F32() + { + } // ~Inst_DS__DS_MAX_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. 
+ // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX. + void + Inst_DS__DS_MAX_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_NOP class methods --- + + Inst_DS__DS_NOP::Inst_DS__DS_NOP(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_nop") + { + setFlag(Nop); + } // Inst_DS__DS_NOP + + Inst_DS__DS_NOP::~Inst_DS__DS_NOP() + { + } // ~Inst_DS__DS_NOP + + // --- description from .arch file --- + // Do nothing. + void + Inst_DS__DS_NOP::execute(GPUDynInstPtr gpuDynInst) + { + gpuDynInst->wavefront()->decLGKMInstsIssued(); + } // execute + // --- Inst_DS__DS_ADD_F32 class methods --- + + Inst_DS__DS_ADD_F32::Inst_DS__DS_ADD_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_f32") + { + setFlag(F32); + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicAdd); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_ADD_F32 + + Inst_DS__DS_ADD_F32::~Inst_DS__DS_ADD_F32() + { + } // ~Inst_DS__DS_ADD_F32 + + // --- description from .arch file --- + // 32b: + // MEM[ADDR] += DATA; + // Floating point add that handles NaN/INF/denormal values. 
+ void + Inst_DS__DS_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandF32 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE_B8 class methods --- + + Inst_DS__DS_WRITE_B8::Inst_DS__DS_WRITE_B8(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b8") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B8 + + Inst_DS__DS_WRITE_B8::~Inst_DS__DS_WRITE_B8() + { + } // ~Inst_DS__DS_WRITE_B8 + + // --- description from .arch file --- + // MEM[ADDR] = DATA[7:0]. + // Byte write. 
+ void + Inst_DS__DS_WRITE_B8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU8 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B8::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE_B8_D16_HI class methods --- + + Inst_DS__DS_WRITE_B8_D16_HI::Inst_DS__DS_WRITE_B8_D16_HI(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b8_d16_hi") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B8_D16_HI + + Inst_DS__DS_WRITE_B8_D16_HI::~Inst_DS__DS_WRITE_B8_D16_HI() + { + } // ~Inst_DS__DS_WRITE_B8_D16_HI + + // --- description from .arch file --- + // MEM[ADDR] = DATA[23:16]. + // Byte write in to high word. 
+ void + Inst_DS__DS_WRITE_B8_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU8 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = bits(data[lane], 23, 16); + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B8_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B8_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE_B16 class methods --- + + Inst_DS__DS_WRITE_B16::Inst_DS__DS_WRITE_B16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b16") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B16 + + Inst_DS__DS_WRITE_B16::~Inst_DS__DS_WRITE_B16() + { + } // ~Inst_DS__DS_WRITE_B16 + + // --- description from .arch file --- + // MEM[ADDR] = DATA[15:0] + // Short write. 
+ void + Inst_DS__DS_WRITE_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU16 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B16::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_ADD_RTN_U32 class methods --- + + Inst_DS__DS_ADD_RTN_U32::Inst_DS__DS_ADD_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_rtn_u32") + { + } // Inst_DS__DS_ADD_RTN_U32 + + Inst_DS__DS_ADD_RTN_U32::~Inst_DS__DS_ADD_RTN_U32() + { + } // ~Inst_DS__DS_ADD_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_ADD_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_RTN_U32 class methods --- + + Inst_DS__DS_SUB_RTN_U32::Inst_DS__DS_SUB_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_rtn_u32") + { + } // Inst_DS__DS_SUB_RTN_U32 + + Inst_DS__DS_SUB_RTN_U32::~Inst_DS__DS_SUB_RTN_U32() + { + } // ~Inst_DS__DS_SUB_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_SUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_RTN_U32 class methods --- + + Inst_DS__DS_RSUB_RTN_U32::Inst_DS__DS_RSUB_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_rtn_u32") + { + } // Inst_DS__DS_RSUB_RTN_U32 + + Inst_DS__DS_RSUB_RTN_U32::~Inst_DS__DS_RSUB_RTN_U32() + { + } // ~Inst_DS__DS_RSUB_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA - MEM[ADDR]; + // RETURN_DATA = tmp. + // Subtraction with reversed operands. + void + Inst_DS__DS_RSUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_RTN_U32 class methods --- + + Inst_DS__DS_INC_RTN_U32::Inst_DS__DS_INC_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_rtn_u32") + { + } // Inst_DS__DS_INC_RTN_U32 + + Inst_DS__DS_INC_RTN_U32::~Inst_DS__DS_INC_RTN_U32() + { + } // ~Inst_DS__DS_INC_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_INC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_RTN_U32 class methods --- + + Inst_DS__DS_DEC_RTN_U32::Inst_DS__DS_DEC_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_rtn_u32") + { + } // Inst_DS__DS_DEC_RTN_U32 + + Inst_DS__DS_DEC_RTN_U32::~Inst_DS__DS_DEC_RTN_U32() + { + } // ~Inst_DS__DS_DEC_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_DS__DS_DEC_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_I32 class methods --- + + Inst_DS__DS_MIN_RTN_I32::Inst_DS__DS_MIN_RTN_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_i32") + { + } // Inst_DS__DS_MIN_RTN_I32 + + Inst_DS__DS_MIN_RTN_I32::~Inst_DS__DS_MIN_RTN_I32() + { + } // ~Inst_DS__DS_MIN_RTN_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_RTN_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_I32 class methods --- + + Inst_DS__DS_MAX_RTN_I32::Inst_DS__DS_MAX_RTN_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_i32") + { + } // Inst_DS__DS_MAX_RTN_I32 + + Inst_DS__DS_MAX_RTN_I32::~Inst_DS__DS_MAX_RTN_I32() + { + } // ~Inst_DS__DS_MAX_RTN_I32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_MAX_RTN_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_U32 class methods --- + + Inst_DS__DS_MIN_RTN_U32::Inst_DS__DS_MIN_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_u32") + { + } // Inst_DS__DS_MIN_RTN_U32 + + Inst_DS__DS_MIN_RTN_U32::~Inst_DS__DS_MIN_RTN_U32() + { + } // ~Inst_DS__DS_MIN_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MIN_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_U32 class methods --- + + Inst_DS__DS_MAX_RTN_U32::Inst_DS__DS_MAX_RTN_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_u32") + { + } // Inst_DS__DS_MAX_RTN_U32 + + Inst_DS__DS_MAX_RTN_U32::~Inst_DS__DS_MAX_RTN_U32() + { + } // ~Inst_DS__DS_MAX_RTN_U32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. + void + Inst_DS__DS_MAX_RTN_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_RTN_B32 class methods --- + + Inst_DS__DS_AND_RTN_B32::Inst_DS__DS_AND_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_rtn_b32") + { + } // Inst_DS__DS_AND_RTN_B32 + + Inst_DS__DS_AND_RTN_B32::~Inst_DS__DS_AND_RTN_B32() + { + } // ~Inst_DS__DS_AND_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_AND_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_RTN_B32 class methods --- + + Inst_DS__DS_OR_RTN_B32::Inst_DS__DS_OR_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_rtn_b32") + { + } // Inst_DS__DS_OR_RTN_B32 + + Inst_DS__DS_OR_RTN_B32::~Inst_DS__DS_OR_RTN_B32() + { + } // ~Inst_DS__DS_OR_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_OR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_RTN_B32 class methods --- + + Inst_DS__DS_XOR_RTN_B32::Inst_DS__DS_XOR_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_rtn_b32") + { + } // Inst_DS__DS_XOR_RTN_B32 + + Inst_DS__DS_XOR_RTN_B32::~Inst_DS__DS_XOR_RTN_B32() + { + } // ~Inst_DS__DS_XOR_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_DS__DS_XOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_RTN_B32 class methods --- + + Inst_DS__DS_MSKOR_RTN_B32::Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_rtn_b32") + { + } // Inst_DS__DS_MSKOR_RTN_B32 + + Inst_DS__DS_MSKOR_RTN_B32::~Inst_DS__DS_MSKOR_RTN_B32() + { + } // ~Inst_DS__DS_MSKOR_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. 
+ void + Inst_DS__DS_MSKOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG_RTN_B32 class methods --- + + Inst_DS__DS_WRXCHG_RTN_B32::Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg_rtn_b32") + { + } // Inst_DS__DS_WRXCHG_RTN_B32 + + Inst_DS__DS_WRXCHG_RTN_B32::~Inst_DS__DS_WRXCHG_RTN_B32() + { + } // ~Inst_DS__DS_WRXCHG_RTN_B32 + + // --- description from .arch file --- + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + // Write-exchange operation. + void + Inst_DS__DS_WRXCHG_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2_RTN_B32 class methods --- + + Inst_DS__DS_WRXCHG2_RTN_B32::Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2_rtn_b32") + { + } // Inst_DS__DS_WRXCHG2_RTN_B32 + + Inst_DS__DS_WRXCHG2_RTN_B32::~Inst_DS__DS_WRXCHG2_RTN_B32() + { + } // ~Inst_DS__DS_WRXCHG2_RTN_B32 + + // --- description from .arch file --- + // Write-exchange 2 separate dwords. + void + Inst_DS__DS_WRXCHG2_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2ST64_RTN_B32 class methods --- + + Inst_DS__DS_WRXCHG2ST64_RTN_B32::Inst_DS__DS_WRXCHG2ST64_RTN_B32( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b32") + { + } // Inst_DS__DS_WRXCHG2ST64_RTN_B32 + + Inst_DS__DS_WRXCHG2ST64_RTN_B32::~Inst_DS__DS_WRXCHG2ST64_RTN_B32() + { + } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B32 + + // --- description from .arch file --- + // Write-exchange 2 separate dwords with a stride of 64 dwords. 
+ void + Inst_DS__DS_WRXCHG2ST64_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_B32 class methods --- + + Inst_DS__DS_CMPST_RTN_B32::Inst_DS__DS_CMPST_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_b32") + { + } // Inst_DS__DS_CMPST_RTN_B32 + + Inst_DS__DS_CMPST_RTN_B32::~Inst_DS__DS_CMPST_RTN_B32() + { + } // ~Inst_DS__DS_CMPST_RTN_B32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP opcode. + void + Inst_DS__DS_CMPST_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_F32 class methods --- + + Inst_DS__DS_CMPST_RTN_F32::Inst_DS__DS_CMPST_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_CMPST_RTN_F32 + + Inst_DS__DS_CMPST_RTN_F32::~Inst_DS__DS_CMPST_RTN_F32() + { + } // ~Inst_DS__DS_CMPST_RTN_F32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP opcode. + void + Inst_DS__DS_CMPST_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_F32 class methods --- + + Inst_DS__DS_MIN_RTN_F32::Inst_DS__DS_MIN_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_MIN_RTN_F32 + + Inst_DS__DS_MIN_RTN_F32::~Inst_DS__DS_MIN_RTN_F32() + { + } // ~Inst_DS__DS_MIN_RTN_F32 + + // --- description from .arch file --- + // 32b. 
+ // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN. + void + Inst_DS__DS_MIN_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_F32 class methods --- + + Inst_DS__DS_MAX_RTN_F32::Inst_DS__DS_MAX_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_MAX_RTN_F32 + + Inst_DS__DS_MAX_RTN_F32::~Inst_DS__DS_MAX_RTN_F32() + { + } // ~Inst_DS__DS_MAX_RTN_F32 + + // --- description from .arch file --- + // 32b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. + // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX. + void + Inst_DS__DS_MAX_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRAP_RTN_B32 class methods --- + + Inst_DS__DS_WRAP_RTN_B32::Inst_DS__DS_WRAP_RTN_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrap_rtn_b32") + { + } // Inst_DS__DS_WRAP_RTN_B32 + + Inst_DS__DS_WRAP_RTN_B32::~Inst_DS__DS_WRAP_RTN_B32() + { + } // ~Inst_DS__DS_WRAP_RTN_B32 + + // --- description from .arch file --- + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? tmp - DATA : tmp + DATA2; + // RETURN_DATA = tmp. 
+ void + Inst_DS__DS_WRAP_RTN_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_RTN_F32 class methods --- + + Inst_DS__DS_ADD_RTN_F32::Inst_DS__DS_ADD_RTN_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_rtn_f32") + { + setFlag(F32); + } // Inst_DS__DS_ADD_RTN_F32 + + Inst_DS__DS_ADD_RTN_F32::~Inst_DS__DS_ADD_RTN_F32() + { + } // ~Inst_DS__DS_ADD_RTN_F32 + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + // Floating point add that handles NaN/INF/denormal values. + void + Inst_DS__DS_ADD_RTN_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_READ_B32 class methods --- + + Inst_DS__DS_READ_B32::Inst_DS__DS_READ_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b32") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B32 + + Inst_DS__DS_READ_B32::~Inst_DS__DS_READ_B32() + { + } // ~Inst_DS__DS_READ_B32 + + // --- description from .arch file --- + // RETURN_DATA = MEM[ADDR]. + // Dword read. 
+ void + Inst_DS__DS_READ_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ2_B32 class methods --- + + Inst_DS__DS_READ2_B32::Inst_DS__DS_READ2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2_b32") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2_B32 + + Inst_DS__DS_READ2_B32::~Inst_DS__DS_READ2_B32() + { + } // ~Inst_DS__DS_READ2_B32 + + // --- description from .arch file --- + // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4]. + // Read 2 dwords. 
+ void + Inst_DS__DS_READ2_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 4; + Addr offset1 = instData.OFFSET1 * 4; + + initDualMemRead(gpuDynInst, offset0, offset1); + } // initiateAcc + + void + Inst_DS__DS_READ2_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } // completeAcc + // --- Inst_DS__DS_READ2ST64_B32 class methods --- + + Inst_DS__DS_READ2ST64_B32::Inst_DS__DS_READ2ST64_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2st64_b32") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2ST64_B32 + + Inst_DS__DS_READ2ST64_B32::~Inst_DS__DS_READ2ST64_B32() + { + } // ~Inst_DS__DS_READ2ST64_B32 + + // --- description from .arch file --- + // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4 * 64]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4 * 64]. + // Read 2 dwords. 
+ void + Inst_DS__DS_READ2ST64_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = (instData.OFFSET0 * 4 * 64); + Addr offset1 = (instData.OFFSET1 * 4 * 64); + + initDualMemRead(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } + // --- Inst_DS__DS_READ_I8 class methods --- + + Inst_DS__DS_READ_I8::Inst_DS__DS_READ_I8(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_i8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_I8 + + Inst_DS__DS_READ_I8::~Inst_DS__DS_READ_I8() + { + } // ~Inst_DS__DS_READ_I8 + + // --- description from .arch file --- + // RETURN_DATA = signext(MEM[ADDR][7:0]). + // Signed byte read. 
+ void + Inst_DS__DS_READ_I8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_I8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_I8::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)sext<8>((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ_U8 class methods --- + + Inst_DS__DS_READ_U8::Inst_DS__DS_READ_U8(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u8") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U8 + + Inst_DS__DS_READ_U8::~Inst_DS__DS_READ_U8() + { + } // ~Inst_DS__DS_READ_U8 + + // --- description from .arch file --- + // RETURN_DATA = {24'h0,MEM[ADDR][7:0]}. + // Unsigned byte read. 
+ void + Inst_DS__DS_READ_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_U8::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)(reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ_I16 class methods --- + + Inst_DS__DS_READ_I16::Inst_DS__DS_READ_I16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_i16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_I16 + + Inst_DS__DS_READ_I16::~Inst_DS__DS_READ_I16() + { + } // ~Inst_DS__DS_READ_I16 + + // --- description from .arch file --- + // RETURN_DATA = signext(MEM[ADDR][15:0]). + // Signed short read. 
+ void + Inst_DS__DS_READ_I16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_READ_U16 class methods --- + + Inst_DS__DS_READ_U16::Inst_DS__DS_READ_U16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U16 + + Inst_DS__DS_READ_U16::~Inst_DS__DS_READ_U16() + { + } // ~Inst_DS__DS_READ_U16 + + // --- description from .arch file --- + // RETURN_DATA = {16'h0,MEM[ADDR][15:0]}. + // Unsigned short read. + void + Inst_DS__DS_READ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + void + Inst_DS__DS_READ_U16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)(reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ_U16_D16 class methods --- + + Inst_DS__DS_READ_U16_D16:: + Inst_DS__DS_READ_U16_D16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u16_d16_hi") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U16_D16 + + Inst_DS__DS_READ_U16_D16::~Inst_DS__DS_READ_U16_D16() + { + } // 
~Inst_DS__DS_READ_U16_D16 + + // --- description from .arch file --- + // RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16; + // // RETURN_DATA[31:16] is preserved. + void + Inst_DS__DS_READ_U16_D16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + void + Inst_DS__DS_READ_U16_D16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U16_D16::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + VecElemU16 ds_val = reinterpret_cast( + gpuDynInst->d_data)[lane]; + replaceBits(vdst[lane], 15, 0, ds_val); + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ_U16_D16_HI class methods --- + + Inst_DS__DS_READ_U16_D16_HI:: + Inst_DS__DS_READ_U16_D16_HI(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u16_d16_hi") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U16_D16_HI + + Inst_DS__DS_READ_U16_D16_HI::~Inst_DS__DS_READ_U16_D16_HI() + { + } // ~Inst_DS__DS_READ_U16_D16_HI + + // --- description from .arch file --- + // RETURN_DATA[31 : 16].u16 = MEM[ADDR].u16; + // // RETURN_DATA[15:0] is preserved. 
+ void + Inst_DS__DS_READ_U16_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + void + Inst_DS__DS_READ_U16_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U16_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + VecElemU16 ds_val = reinterpret_cast( + gpuDynInst->d_data)[lane]; + replaceBits(vdst[lane], 31, 16, ds_val); + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_SWIZZLE_B32 class methods --- + + Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_swizzle_b32") + { + /** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ + setFlag(Load); + setFlag(ALU); + } // Inst_DS__DS_SWIZZLE_B32 + + Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() + { + } // ~Inst_DS__DS_SWIZZLE_B32 + + // --- description from .arch file --- + // RETURN_DATA = swizzle(vgpr_data, offset1:offset0). + // Dword swizzle, no data is written to LDS memory; See ds_opcodes.docx for + // --- details. 
+ void + Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + wf->decLGKMInstsIssued(); + + if (gpuDynInst->exec_mask.none()) { + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit() + ->cyclesToTicks(Cycles(24))); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + VecOperandU32 vdst(gpuDynInst, extData.VDST); + /** + * The "DS pattern" is comprised of both offset fields. That is, the + * swizzle pattern between lanes. Bit 15 of the DS pattern dictates + * which swizzle mode to use. There are two different swizzle + * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use + * QDMode else use Bit-masks mode. The remaining bits dictate how to + * swizzle the lanes. + * + * QDMode: Chunks the lanes into 4s and swizzles among them. + * Bits 7:6 dictate where lane 3 (of the current chunk) + * gets its date, 5:4 lane 2, etc. + * + * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks. + * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 + * is the and_mask. Each lane is swizzled by performing + * the appropriate operation using these masks. + */ + VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); + + data.read(); + + if (bits(ds_pattern, 15)) { + // QDMode + for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { + /** + * This operation allows data sharing between groups + * of four consecutive threads. Note the increment by + * 4 in the for loop. + */ + if (gpuDynInst->exec_mask[lane]) { + int index0 = lane + bits(ds_pattern, 1, 0); + panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index0); + vdst[lane] + = gpuDynInst->exec_mask[index0] ? 
data[index0]: 0; + } + if (gpuDynInst->exec_mask[lane + 1]) { + int index1 = lane + bits(ds_pattern, 3, 2); + panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index1); + vdst[lane + 1] + = gpuDynInst->exec_mask[index1] ? data[index1]: 0; + } + if (gpuDynInst->exec_mask[lane + 2]) { + int index2 = lane + bits(ds_pattern, 5, 4); + panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index2); + vdst[lane + 2] + = gpuDynInst->exec_mask[index2] ? data[index2]: 0; + } + if (gpuDynInst->exec_mask[lane + 3]) { + int index3 = lane + bits(ds_pattern, 7, 6); + panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " + "is out of bounds.\n", gpuDynInst->disassemble(), + index3); + vdst[lane + 3] + = gpuDynInst->exec_mask[index3] ? data[index3]: 0; + } + } + } else { + // Bit Mode + int and_mask = bits(ds_pattern, 4, 0); + int or_mask = bits(ds_pattern, 9, 5); + int xor_mask = bits(ds_pattern, 14, 10); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + int index = (((lane & and_mask) | or_mask) ^ xor_mask); + // Adjust for the next 32 lanes. + if (lane > 31) { + index += 32; + } + panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is " + "out of bounds.\n", gpuDynInst->disassemble(), + index); + vdst[lane] + = gpuDynInst->exec_mask[index] ? data[index] : 0; + } + } + } + + vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. 
+ */ + wf->rdLmReqsInPipe--; + } // execute + // --- Inst_DS__DS_PERMUTE_B32 class methods --- + + Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_permute_b32") + { + setFlag(MemoryRef); + /** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ + setFlag(Load); + } // Inst_DS__DS_PERMUTE_B32 + + Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32() + { + } // ~Inst_DS__DS_PERMUTE_B32 + + // --- description from .arch file --- + // Forward permute. + void + Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + wf->decLGKMInstsIssued(); + + if (gpuDynInst->exec_mask.none()) { + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit() + ->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + addr.read(); + data.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + /** + * One of the offset fields can be used for the index. + * It is assumed OFFSET0 would be used, as OFFSET1 is + * typically only used for DS ops that operate on two + * disparate pieces of data. + */ + assert(!instData.OFFSET1); + /** + * The address provided is a byte address, but VGPRs are + * 4 bytes, so we must divide by 4 to get the actual VGPR + * index. Additionally, the index is calculated modulo the + * WF size, 64 in this case, so we simply extract bits 7-2. 
+ */ + int index = bits(addr[lane] + instData.OFFSET0, 7, 2); + panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " + "of bounds.\n", gpuDynInst->disassemble(), index); + /** + * If the shuffled index corresponds to a lane that is + * inactive then this instruction writes a 0 to the active + * lane in VDST. + */ + if (wf->execMask(index)) { + vdst[index] = data[lane]; + } else { + vdst[index] = 0; + } + } + } + + vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; + } // execute + // --- Inst_DS__DS_BPERMUTE_B32 class methods --- + + Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_bpermute_b32") + { + setFlag(MemoryRef); + /** + * While this operation doesn't actually use DS storage we classify + * it as a load here because it does a writeback to a VGPR, which + * fits in better with the LDS pipeline logic. + */ + setFlag(Load); + } // Inst_DS__DS_BPERMUTE_B32 + + Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32() + { + } // ~Inst_DS__DS_BPERMUTE_B32 + + // --- description from .arch file --- + // Backward permute. 
+ void + Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + wf->decLGKMInstsIssued(); + + if (gpuDynInst->exec_mask.none()) { + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit() + ->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data(gpuDynInst, extData.DATA0); + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + addr.read(); + data.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + /** + * One of the offset fields can be used for the index. + * It is assumed OFFSET0 would be used, as OFFSET1 is + * typically only used for DS ops that operate on two + * disparate pieces of data. + */ + assert(!instData.OFFSET1); + /** + * The address provided is a byte address, but VGPRs are + * 4 bytes, so we must divide by 4 to get the actual VGPR + * index. Additionally, the index is calculated modulo the + * WF size, 64 in this case, so we simply extract bits 7-2. + */ + int index = bits(addr[lane] + instData.OFFSET0, 7, 2); + panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " + "of bounds.\n", gpuDynInst->disassemble(), index); + /** + * If the shuffled index corresponds to a lane that is + * inactive then this instruction writes a 0 to the active + * lane in VDST. + */ + if (wf->execMask(index)) { + vdst[lane] = data[index]; + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. 
+ * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); + /** + * Similarly, this counter could build up over time, even across + * multiple wavefronts, and cause a deadlock. + */ + wf->rdLmReqsInPipe--; + } // execute + + // --- Inst_DS__DS_ADD_U64 class methods --- + + Inst_DS__DS_ADD_U64::Inst_DS__DS_ADD_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_u64") + { + setFlag(MemoryRef); + setFlag(GroupSegment); + setFlag(AtomicAdd); + setFlag(AtomicNoReturn); + } // Inst_DS__DS_ADD_U64 + + Inst_DS__DS_ADD_U64::~Inst_DS__DS_ADD_U64() + { + } // ~Inst_DS__DS_ADD_U64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR] += DATA[0:1]; + void + Inst_DS__DS_ADD_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_ADD_U64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initAtomicAccess(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_ADD_U64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_SUB_U64 class methods --- + + 
Inst_DS__DS_SUB_U64::Inst_DS__DS_SUB_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_u64") + { + } // Inst_DS__DS_SUB_U64 + + Inst_DS__DS_SUB_U64::~Inst_DS__DS_SUB_U64() + { + } // ~Inst_DS__DS_SUB_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_SUB_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_U64 class methods --- + + Inst_DS__DS_RSUB_U64::Inst_DS__DS_RSUB_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_u64") + { + } // Inst_DS__DS_RSUB_U64 + + Inst_DS__DS_RSUB_U64::~Inst_DS__DS_RSUB_U64() + { + } // ~Inst_DS__DS_RSUB_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA - MEM[ADDR]; + // RETURN_DATA = tmp. + // Subtraction with reversed operands. + void + Inst_DS__DS_RSUB_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_U64 class methods --- + + Inst_DS__DS_INC_U64::Inst_DS__DS_INC_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_u64") + { + } // Inst_DS__DS_INC_U64 + + Inst_DS__DS_INC_U64::~Inst_DS__DS_INC_U64() + { + } // ~Inst_DS__DS_INC_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_INC_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_U64 class methods --- + + Inst_DS__DS_DEC_U64::Inst_DS__DS_DEC_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_u64") + { + } // Inst_DS__DS_DEC_U64 + + Inst_DS__DS_DEC_U64::~Inst_DS__DS_DEC_U64() + { + } // ~Inst_DS__DS_DEC_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_DS__DS_DEC_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_I64 class methods --- + + Inst_DS__DS_MIN_I64::Inst_DS__DS_MIN_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_i64") + { + } // Inst_DS__DS_MIN_I64 + + Inst_DS__DS_MIN_I64::~Inst_DS__DS_MIN_I64() + { + } // ~Inst_DS__DS_MIN_I64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MIN_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_I64 class methods --- + + Inst_DS__DS_MAX_I64::Inst_DS__DS_MAX_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_i64") + { + } // Inst_DS__DS_MAX_I64 + + Inst_DS__DS_MAX_I64::~Inst_DS__DS_MAX_I64() + { + } // ~Inst_DS__DS_MAX_I64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MAX_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_U64 class methods --- + + Inst_DS__DS_MIN_U64::Inst_DS__DS_MIN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_u64") + { + } // Inst_DS__DS_MIN_U64 + + Inst_DS__DS_MIN_U64::~Inst_DS__DS_MIN_U64() + { + } // ~Inst_DS__DS_MIN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_DS__DS_MIN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_U64 class methods --- + + Inst_DS__DS_MAX_U64::Inst_DS__DS_MAX_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_u64") + { + } // Inst_DS__DS_MAX_U64 + + Inst_DS__DS_MAX_U64::~Inst_DS__DS_MAX_U64() + { + } // ~Inst_DS__DS_MAX_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MAX_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_B64 class methods --- + + Inst_DS__DS_AND_B64::Inst_DS__DS_AND_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_b64") + { + } // Inst_DS__DS_AND_B64 + + Inst_DS__DS_AND_B64::~Inst_DS__DS_AND_B64() + { + } // ~Inst_DS__DS_AND_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_AND_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_B64 class methods --- + + Inst_DS__DS_OR_B64::Inst_DS__DS_OR_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_b64") + { + } // Inst_DS__DS_OR_B64 + + Inst_DS__DS_OR_B64::~Inst_DS__DS_OR_B64() + { + } // ~Inst_DS__DS_OR_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_OR_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_B64 class methods --- + + Inst_DS__DS_XOR_B64::Inst_DS__DS_XOR_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_b64") + { + } // Inst_DS__DS_XOR_B64 + + Inst_DS__DS_XOR_B64::~Inst_DS__DS_XOR_B64() + { + } // ~Inst_DS__DS_XOR_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_DS__DS_XOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_B64 class methods --- + + Inst_DS__DS_MSKOR_B64::Inst_DS__DS_MSKOR_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_b64") + { + } // Inst_DS__DS_MSKOR_B64 + + Inst_DS__DS_MSKOR_B64::~Inst_DS__DS_MSKOR_B64() + { + } // ~Inst_DS__DS_MSKOR_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. + void + Inst_DS__DS_MSKOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_B64 class methods --- + + Inst_DS__DS_WRITE_B64::Inst_DS__DS_WRITE_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B64 + + Inst_DS__DS_WRITE_B64::~Inst_DS__DS_WRITE_B64() + { + } // ~Inst_DS__DS_WRITE_B64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR] = DATA. + // Write qword. 
+ void + Inst_DS__DS_WRITE_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data(gpuDynInst, extData.DATA0); + + addr.read(); + data.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE2_B64 class methods --- + + Inst_DS__DS_WRITE2_B64::Inst_DS__DS_WRITE2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2_B64 + + Inst_DS__DS_WRITE2_B64::~Inst_DS__DS_WRITE2_B64() + { + } // ~Inst_DS__DS_WRITE2_B64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR_BASE + OFFSET0 * 8] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 8] = DATA2. + // Write 2 qwords. 
+ void + Inst_DS__DS_WRITE2_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 8; + Addr offset1 = instData.OFFSET1 * 8; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_WRITE2ST64_B64 class methods --- + + Inst_DS__DS_WRITE2ST64_B64::Inst_DS__DS_WRITE2ST64_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write2st64_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE2ST64_B64 + + Inst_DS__DS_WRITE2ST64_B64::~Inst_DS__DS_WRITE2ST64_B64() + { + } // ~Inst_DS__DS_WRITE2ST64_B64 + + // --- description from .arch file --- + // 64b: + // MEM[ADDR_BASE + OFFSET0 * 8 * 64] = DATA; + // MEM[ADDR_BASE + OFFSET1 * 8 * 64] = DATA2; + // Write 2 qwords. 
+ void + Inst_DS__DS_WRITE2ST64_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); + + addr.read(); + data0.read(); + data1.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 8 * 64; + Addr offset1 = instData.OFFSET1 * 8 * 64; + + initDualMemWrite(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_WRITE2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + } + // --- Inst_DS__DS_CMPST_B64 class methods --- + + Inst_DS__DS_CMPST_B64::Inst_DS__DS_CMPST_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_b64") + { + } // Inst_DS__DS_CMPST_B64 + + Inst_DS__DS_CMPST_B64::~Inst_DS__DS_CMPST_B64() + { + } // ~Inst_DS__DS_CMPST_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. 
+ void + Inst_DS__DS_CMPST_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_F64 class methods --- + + Inst_DS__DS_CMPST_F64::Inst_DS__DS_CMPST_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_f64") + { + setFlag(F64); + } // Inst_DS__DS_CMPST_F64 + + Inst_DS__DS_CMPST_F64::~Inst_DS__DS_CMPST_F64() + { + } // ~Inst_DS__DS_CMPST_F64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode. + void + Inst_DS__DS_CMPST_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_F64 class methods --- + + Inst_DS__DS_MIN_F64::Inst_DS__DS_MIN_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_f64") + { + setFlag(F64); + } // Inst_DS__DS_MIN_F64 + + Inst_DS__DS_MIN_F64::~Inst_DS__DS_MIN_F64() + { + } // ~Inst_DS__DS_MIN_F64 + + // --- description from .arch file --- + // 64b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN_X2. + void + Inst_DS__DS_MIN_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_F64 class methods --- + + Inst_DS__DS_MAX_F64::Inst_DS__DS_MAX_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_f64") + { + setFlag(F64); + } // Inst_DS__DS_MAX_F64 + + Inst_DS__DS_MAX_F64::~Inst_DS__DS_MAX_F64() + { + } // ~Inst_DS__DS_MAX_F64 + + // --- description from .arch file --- + // 64b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. 
+ // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX_X2. + void + Inst_DS__DS_MAX_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_RTN_U64 class methods --- + + Inst_DS__DS_ADD_RTN_U64::Inst_DS__DS_ADD_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_rtn_u64") + { + } // Inst_DS__DS_ADD_RTN_U64 + + Inst_DS__DS_ADD_RTN_U64::~Inst_DS__DS_ADD_RTN_U64() + { + } // ~Inst_DS__DS_ADD_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_ADD_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_RTN_U64 class methods --- + + Inst_DS__DS_SUB_RTN_U64::Inst_DS__DS_SUB_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_rtn_u64") + { + } // Inst_DS__DS_SUB_RTN_U64 + + Inst_DS__DS_SUB_RTN_U64::~Inst_DS__DS_SUB_RTN_U64() + { + } // ~Inst_DS__DS_SUB_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_SUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_RTN_U64 class methods --- + + Inst_DS__DS_RSUB_RTN_U64::Inst_DS__DS_RSUB_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_rtn_u64") + { + } // Inst_DS__DS_RSUB_RTN_U64 + + Inst_DS__DS_RSUB_RTN_U64::~Inst_DS__DS_RSUB_RTN_U64() + { + } // ~Inst_DS__DS_RSUB_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA - MEM[ADDR]; + // RETURN_DATA = tmp. + // Subtraction with reversed operands. 
+ void + Inst_DS__DS_RSUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_RTN_U64 class methods --- + + Inst_DS__DS_INC_RTN_U64::Inst_DS__DS_INC_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_rtn_u64") + { + } // Inst_DS__DS_INC_RTN_U64 + + Inst_DS__DS_INC_RTN_U64::~Inst_DS__DS_INC_RTN_U64() + { + } // ~Inst_DS__DS_INC_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_INC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_RTN_U64 class methods --- + + Inst_DS__DS_DEC_RTN_U64::Inst_DS__DS_DEC_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_rtn_u64") + { + } // Inst_DS__DS_DEC_RTN_U64 + + Inst_DS__DS_DEC_RTN_U64::~Inst_DS__DS_DEC_RTN_U64() + { + } // ~Inst_DS__DS_DEC_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_DEC_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_I64 class methods --- + + Inst_DS__DS_MIN_RTN_I64::Inst_DS__DS_MIN_RTN_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_i64") + { + } // Inst_DS__DS_MIN_RTN_I64 + + Inst_DS__DS_MIN_RTN_I64::~Inst_DS__DS_MIN_RTN_I64() + { + } // ~Inst_DS__DS_MIN_RTN_I64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_DS__DS_MIN_RTN_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_I64 class methods --- + + Inst_DS__DS_MAX_RTN_I64::Inst_DS__DS_MAX_RTN_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_i64") + { + } // Inst_DS__DS_MAX_RTN_I64 + + Inst_DS__DS_MAX_RTN_I64::~Inst_DS__DS_MAX_RTN_I64() + { + } // ~Inst_DS__DS_MAX_RTN_I64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MAX_RTN_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_U64 class methods --- + + Inst_DS__DS_MIN_RTN_U64::Inst_DS__DS_MIN_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_u64") + { + } // Inst_DS__DS_MIN_RTN_U64 + + Inst_DS__DS_MIN_RTN_U64::~Inst_DS__DS_MIN_RTN_U64() + { + } // ~Inst_DS__DS_MIN_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_MIN_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_U64 class methods --- + + Inst_DS__DS_MAX_RTN_U64::Inst_DS__DS_MAX_RTN_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_u64") + { + } // Inst_DS__DS_MAX_RTN_U64 + + Inst_DS__DS_MAX_RTN_U64::~Inst_DS__DS_MAX_RTN_U64() + { + } // ~Inst_DS__DS_MAX_RTN_U64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_DS__DS_MAX_RTN_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_RTN_B64 class methods --- + + Inst_DS__DS_AND_RTN_B64::Inst_DS__DS_AND_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_rtn_b64") + { + } // Inst_DS__DS_AND_RTN_B64 + + Inst_DS__DS_AND_RTN_B64::~Inst_DS__DS_AND_RTN_B64() + { + } // ~Inst_DS__DS_AND_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_AND_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_RTN_B64 class methods --- + + Inst_DS__DS_OR_RTN_B64::Inst_DS__DS_OR_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_rtn_b64") + { + } // Inst_DS__DS_OR_RTN_B64 + + Inst_DS__DS_OR_RTN_B64::~Inst_DS__DS_OR_RTN_B64() + { + } // ~Inst_DS__DS_OR_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_DS__DS_OR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_RTN_B64 class methods --- + + Inst_DS__DS_XOR_RTN_B64::Inst_DS__DS_XOR_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_rtn_b64") + { + } // Inst_DS__DS_XOR_RTN_B64 + + Inst_DS__DS_XOR_RTN_B64::~Inst_DS__DS_XOR_RTN_B64() + { + } // ~Inst_DS__DS_XOR_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_DS__DS_XOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MSKOR_RTN_B64 class methods --- + + Inst_DS__DS_MSKOR_RTN_B64::Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_mskor_rtn_b64") + { + } // Inst_DS__DS_MSKOR_RTN_B64 + + Inst_DS__DS_MSKOR_RTN_B64::~Inst_DS__DS_MSKOR_RTN_B64() + { + } // ~Inst_DS__DS_MSKOR_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; + // RETURN_DATA = tmp. + // Masked dword OR, D0 contains the mask and D1 contains the new value. + void + Inst_DS__DS_MSKOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG_RTN_B64 class methods --- + + Inst_DS__DS_WRXCHG_RTN_B64::Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg_rtn_b64") + { + } // Inst_DS__DS_WRXCHG_RTN_B64 + + Inst_DS__DS_WRXCHG_RTN_B64::~Inst_DS__DS_WRXCHG_RTN_B64() + { + } // ~Inst_DS__DS_WRXCHG_RTN_B64 + + // --- description from .arch file --- + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + // Write-exchange operation. + void + Inst_DS__DS_WRXCHG_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2_RTN_B64 class methods --- + + Inst_DS__DS_WRXCHG2_RTN_B64::Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2_rtn_b64") + { + } // Inst_DS__DS_WRXCHG2_RTN_B64 + + Inst_DS__DS_WRXCHG2_RTN_B64::~Inst_DS__DS_WRXCHG2_RTN_B64() + { + } // ~Inst_DS__DS_WRXCHG2_RTN_B64 + + // --- description from .arch file --- + // Write-exchange 2 separate qwords. 
+ void + Inst_DS__DS_WRXCHG2_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRXCHG2ST64_RTN_B64 class methods --- + + Inst_DS__DS_WRXCHG2ST64_RTN_B64::Inst_DS__DS_WRXCHG2ST64_RTN_B64( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b64") + { + } // Inst_DS__DS_WRXCHG2ST64_RTN_B64 + + Inst_DS__DS_WRXCHG2ST64_RTN_B64::~Inst_DS__DS_WRXCHG2ST64_RTN_B64() + { + } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B64 + + // --- description from .arch file --- + // Write-exchange 2 qwords with a stride of 64 qwords. + void + Inst_DS__DS_WRXCHG2ST64_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_B64 class methods --- + + Inst_DS__DS_CMPST_RTN_B64::Inst_DS__DS_CMPST_RTN_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_b64") + { + } // Inst_DS__DS_CMPST_RTN_B64 + + Inst_DS__DS_CMPST_RTN_B64::~Inst_DS__DS_CMPST_RTN_B64() + { + } // ~Inst_DS__DS_CMPST_RTN_B64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Compare and store. + // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. + void + Inst_DS__DS_CMPST_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CMPST_RTN_F64 class methods --- + + Inst_DS__DS_CMPST_RTN_F64::Inst_DS__DS_CMPST_RTN_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_cmpst_rtn_f64") + { + setFlag(F64); + } // Inst_DS__DS_CMPST_RTN_F64 + + Inst_DS__DS_CMPST_RTN_F64::~Inst_DS__DS_CMPST_RTN_F64() + { + } // ~Inst_DS__DS_CMPST_RTN_F64 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA2; + // cmp = DATA; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + // Floating point compare and store that handles NaN/INF/denormal values. 
+ // Caution, the order of src and cmp are the *opposite* of the + // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode. + void + Inst_DS__DS_CMPST_RTN_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_RTN_F64 class methods --- + + Inst_DS__DS_MIN_RTN_F64::Inst_DS__DS_MIN_RTN_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_rtn_f64") + { + setFlag(F64); + } // Inst_DS__DS_MIN_RTN_F64 + + Inst_DS__DS_MIN_RTN_F64::~Inst_DS__DS_MIN_RTN_F64() + { + } // ~Inst_DS__DS_MIN_RTN_F64 + + // --- description from .arch file --- + // 64b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (cmp < tmp) ? src : tmp. + // Floating point minimum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMIN_X2. + void + Inst_DS__DS_MIN_RTN_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_RTN_F64 class methods --- + + Inst_DS__DS_MAX_RTN_F64::Inst_DS__DS_MAX_RTN_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_rtn_f64") + { + setFlag(F64); + } // Inst_DS__DS_MAX_RTN_F64 + + Inst_DS__DS_MAX_RTN_F64::~Inst_DS__DS_MAX_RTN_F64() + { + } // ~Inst_DS__DS_MAX_RTN_F64 + + // --- description from .arch file --- + // 64b. + // tmp = MEM[ADDR]; + // src = DATA; + // cmp = DATA2; + // MEM[ADDR] = (tmp > cmp) ? src : tmp. + // Floating point maximum that handles NaN/INF/denormal values. + // Note that this opcode is slightly more general-purpose than + // --- BUFFER_ATOMIC_FMAX_X2. 
+ void + Inst_DS__DS_MAX_RTN_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_READ_B64 class methods --- + + Inst_DS__DS_READ_B64::Inst_DS__DS_READ_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b64") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B64 + + Inst_DS__DS_READ_B64::~Inst_DS__DS_READ_B64() + { + } // ~Inst_DS__DS_READ_B64 + + // --- description from .arch file --- + // RETURN_DATA = MEM[ADDR]. + // Read 1 qword. + void + Inst_DS__DS_READ_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } // completeAcc + // --- Inst_DS__DS_READ2_B64 class methods --- + + Inst_DS__DS_READ2_B64::Inst_DS__DS_READ2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2_b64") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2_B64 + + Inst_DS__DS_READ2_B64::~Inst_DS__DS_READ2_B64() + { + } // ~Inst_DS__DS_READ2_B64 + + // --- description from .arch file --- + // 
RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8]. + // Read 2 qwords. + void + Inst_DS__DS_READ2_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0 * 8; + Addr offset1 = instData.OFFSET1 * 8; + + initDualMemRead(gpuDynInst, offset0, offset1); + } // initiateAcc + + void + Inst_DS__DS_READ2_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst0(gpuDynInst, extData.VDST); + VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } // completeAcc + // --- Inst_DS__DS_READ2ST64_B64 class methods --- + + Inst_DS__DS_READ2ST64_B64::Inst_DS__DS_READ2ST64_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read2st64_b64") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ2ST64_B64 + + Inst_DS__DS_READ2ST64_B64::~Inst_DS__DS_READ2ST64_B64() + { + } // ~Inst_DS__DS_READ2ST64_B64 + + // --- description from .arch file --- + // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8 * 64]; + // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8 * 64]. + // Read 2 qwords. 
+ void + Inst_DS__DS_READ2ST64_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = (instData.OFFSET0 * 8 * 64); + Addr offset1 = (instData.OFFSET1 * 8 * 64); + + initDualMemRead(gpuDynInst, offset0, offset1); + } + + void + Inst_DS__DS_READ2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst0(gpuDynInst, extData.VDST); + VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } + } + + vdst0.write(); + vdst1.write(); + } + // --- Inst_DS__DS_CONDXCHG32_RTN_B64 class methods --- + + Inst_DS__DS_CONDXCHG32_RTN_B64::Inst_DS__DS_CONDXCHG32_RTN_B64( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_condxchg32_rtn_b64") + { + } // Inst_DS__DS_CONDXCHG32_RTN_B64 + + Inst_DS__DS_CONDXCHG32_RTN_B64::~Inst_DS__DS_CONDXCHG32_RTN_B64() + { + } // ~Inst_DS__DS_CONDXCHG32_RTN_B64 + + // --- description from .arch file --- + // Conditional write exchange. 
+ void + Inst_DS__DS_CONDXCHG32_RTN_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_SRC2_U32 class methods --- + + Inst_DS__DS_ADD_SRC2_U32::Inst_DS__DS_ADD_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_src2_u32") + { + } // Inst_DS__DS_ADD_SRC2_U32 + + Inst_DS__DS_ADD_SRC2_U32::~Inst_DS__DS_ADD_SRC2_U32() + { + } // ~Inst_DS__DS_ADD_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] + MEM[B]. + void + Inst_DS__DS_ADD_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_SRC2_U32 class methods --- + + Inst_DS__DS_SUB_SRC2_U32::Inst_DS__DS_SUB_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_src2_u32") + { + } // Inst_DS__DS_SUB_SRC2_U32 + + Inst_DS__DS_SUB_SRC2_U32::~Inst_DS__DS_SUB_SRC2_U32() + { + } // ~Inst_DS__DS_SUB_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] - MEM[B]. + void + Inst_DS__DS_SUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_SRC2_U32 class methods --- + + Inst_DS__DS_RSUB_SRC2_U32::Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_src2_u32") + { + } // Inst_DS__DS_RSUB_SRC2_U32 + + Inst_DS__DS_RSUB_SRC2_U32::~Inst_DS__DS_RSUB_SRC2_U32() + { + } // ~Inst_DS__DS_RSUB_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B] - MEM[A]. 
+ void + Inst_DS__DS_RSUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_SRC2_U32 class methods --- + + Inst_DS__DS_INC_SRC2_U32::Inst_DS__DS_INC_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_src2_u32") + { + } // Inst_DS__DS_INC_SRC2_U32 + + Inst_DS__DS_INC_SRC2_U32::~Inst_DS__DS_INC_SRC2_U32() + { + } // ~Inst_DS__DS_INC_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). + void + Inst_DS__DS_INC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_SRC2_U32 class methods --- + + Inst_DS__DS_DEC_SRC2_U32::Inst_DS__DS_DEC_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_src2_u32") + { + } // Inst_DS__DS_DEC_SRC2_U32 + + Inst_DS__DS_DEC_SRC2_U32::~Inst_DS__DS_DEC_SRC2_U32() + { + } // ~Inst_DS__DS_DEC_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). + // Uint decrement. + void + Inst_DS__DS_DEC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_I32 class methods --- + + Inst_DS__DS_MIN_SRC2_I32::Inst_DS__DS_MIN_SRC2_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_i32") + { + } // Inst_DS__DS_MIN_SRC2_I32 + + Inst_DS__DS_MIN_SRC2_I32::~Inst_DS__DS_MIN_SRC2_I32() + { + } // ~Inst_DS__DS_MIN_SRC2_I32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). 
+ void + Inst_DS__DS_MIN_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_I32 class methods --- + + Inst_DS__DS_MAX_SRC2_I32::Inst_DS__DS_MAX_SRC2_I32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_i32") + { + } // Inst_DS__DS_MAX_SRC2_I32 + + Inst_DS__DS_MAX_SRC2_I32::~Inst_DS__DS_MAX_SRC2_I32() + { + } // ~Inst_DS__DS_MAX_SRC2_I32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). + void + Inst_DS__DS_MAX_SRC2_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_U32 class methods --- + + Inst_DS__DS_MIN_SRC2_U32::Inst_DS__DS_MIN_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_u32") + { + } // Inst_DS__DS_MIN_SRC2_U32 + + Inst_DS__DS_MIN_SRC2_U32::~Inst_DS__DS_MIN_SRC2_U32() + { + } // ~Inst_DS__DS_MIN_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). + void + Inst_DS__DS_MIN_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_U32 class methods --- + + Inst_DS__DS_MAX_SRC2_U32::Inst_DS__DS_MAX_SRC2_U32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_u32") + { + } // Inst_DS__DS_MAX_SRC2_U32 + + Inst_DS__DS_MAX_SRC2_U32::~Inst_DS__DS_MAX_SRC2_U32() + { + } // ~Inst_DS__DS_MAX_SRC2_U32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). 
+ void + Inst_DS__DS_MAX_SRC2_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_SRC2_B32 class methods --- + + Inst_DS__DS_AND_SRC2_B32::Inst_DS__DS_AND_SRC2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_src2_b32") + { + } // Inst_DS__DS_AND_SRC2_B32 + + Inst_DS__DS_AND_SRC2_B32::~Inst_DS__DS_AND_SRC2_B32() + { + } // ~Inst_DS__DS_AND_SRC2_B32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] & MEM[B]. + void + Inst_DS__DS_AND_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_SRC2_B32 class methods --- + + Inst_DS__DS_OR_SRC2_B32::Inst_DS__DS_OR_SRC2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_src2_b32") + { + } // Inst_DS__DS_OR_SRC2_B32 + + Inst_DS__DS_OR_SRC2_B32::~Inst_DS__DS_OR_SRC2_B32() + { + } // ~Inst_DS__DS_OR_SRC2_B32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] | MEM[B]. + void + Inst_DS__DS_OR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_SRC2_B32 class methods --- + + Inst_DS__DS_XOR_SRC2_B32::Inst_DS__DS_XOR_SRC2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_src2_b32") + { + } // Inst_DS__DS_XOR_SRC2_B32 + + Inst_DS__DS_XOR_SRC2_B32::~Inst_DS__DS_XOR_SRC2_B32() + { + } // ~Inst_DS__DS_XOR_SRC2_B32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] ^ MEM[B]. 
+ void + Inst_DS__DS_XOR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_SRC2_B32 class methods --- + + Inst_DS__DS_WRITE_SRC2_B32::Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_src2_b32") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_SRC2_B32 + + Inst_DS__DS_WRITE_SRC2_B32::~Inst_DS__DS_WRITE_SRC2_B32() + { + } // ~Inst_DS__DS_WRITE_SRC2_B32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B]. + // Write dword. + void + Inst_DS__DS_WRITE_SRC2_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_F32 class methods --- + + Inst_DS__DS_MIN_SRC2_F32::Inst_DS__DS_MIN_SRC2_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_f32") + { + setFlag(F32); + } // Inst_DS__DS_MIN_SRC2_F32 + + Inst_DS__DS_MIN_SRC2_F32::~Inst_DS__DS_MIN_SRC2_F32() + { + } // ~Inst_DS__DS_MIN_SRC2_F32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MIN_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_F32 class methods --- + + Inst_DS__DS_MAX_SRC2_F32::Inst_DS__DS_MAX_SRC2_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_f32") + { + setFlag(F32); + } // Inst_DS__DS_MAX_SRC2_F32 + + Inst_DS__DS_MAX_SRC2_F32::~Inst_DS__DS_MAX_SRC2_F32() + { + } // ~Inst_DS__DS_MAX_SRC2_F32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. 
+ // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MAX_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_SRC2_F32 class methods --- + + Inst_DS__DS_ADD_SRC2_F32::Inst_DS__DS_ADD_SRC2_F32(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_src2_f32") + { + setFlag(F32); + } // Inst_DS__DS_ADD_SRC2_F32 + + Inst_DS__DS_ADD_SRC2_F32::~Inst_DS__DS_ADD_SRC2_F32() + { + } // ~Inst_DS__DS_ADD_SRC2_F32 + + // --- description from .arch file --- + // 32b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B] + MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_ADD_SRC2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_RELEASE_ALL class methods --- + + Inst_DS__DS_GWS_SEMA_RELEASE_ALL::Inst_DS__DS_GWS_SEMA_RELEASE_ALL( + InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_release_all") + { + } // Inst_DS__DS_GWS_SEMA_RELEASE_ALL + + Inst_DS__DS_GWS_SEMA_RELEASE_ALL::~Inst_DS__DS_GWS_SEMA_RELEASE_ALL() + { + } // ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL + + // --- description from .arch file --- + // GDS Only: The GWS resource (rid) indicated will process this opcode by + // updating the counter and labeling the specified resource as a semaphore. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // //Incr the state counter of the resource + // state.counter[rid] = state.wave_in_queue; + // state.type = SEMAPHORE; + // return rd_done; //release calling wave + // This action will release ALL queued waves; it Will have no effect if no + // --- waves are present. 
+ void + Inst_DS__DS_GWS_SEMA_RELEASE_ALL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_INIT class methods --- + + Inst_DS__DS_GWS_INIT::Inst_DS__DS_GWS_INIT(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_init") + { + } // Inst_DS__DS_GWS_INIT + + Inst_DS__DS_GWS_INIT::~Inst_DS__DS_GWS_INIT() + { + } // ~Inst_DS__DS_GWS_INIT + + // --- description from .arch file --- + // GDS Only: Initialize a barrier or semaphore resource. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // //Get the value to use in init + // index = find_first_valid(vector mask) + // value = DATA[thread: index] + // //Set the state of the resource + // state.counter[rid] = lsb(value); //limit #waves + // state.flag[rid] = 0; + // return rd_done; //release calling wave + void + Inst_DS__DS_GWS_INIT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_V class methods --- + + Inst_DS__DS_GWS_SEMA_V::Inst_DS__DS_GWS_SEMA_V(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_v") + { + } // Inst_DS__DS_GWS_SEMA_V + + Inst_DS__DS_GWS_SEMA_V::~Inst_DS__DS_GWS_SEMA_V() + { + } // ~Inst_DS__DS_GWS_SEMA_V + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // updating the counter and labeling the resource as a semaphore. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // //Incr the state counter of the resource + // state.counter[rid]++; + // state.type = SEMAPHORE; + // return rd_done; //release calling wave + // This action will release one waved if any are queued in this resource. 
+ void + Inst_DS__DS_GWS_SEMA_V::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_BR class methods --- + + Inst_DS__DS_GWS_SEMA_BR::Inst_DS__DS_GWS_SEMA_BR(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_br") + { + } // Inst_DS__DS_GWS_SEMA_BR + + Inst_DS__DS_GWS_SEMA_BR::~Inst_DS__DS_GWS_SEMA_BR() + { + } // ~Inst_DS__DS_GWS_SEMA_BR + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // updating the counter by the bulk release delivered count and labeling + // the resource as a semaphore. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // index = find first valid (vector mask) + // count = DATA[thread: index]; + // //Add count to the resource state counter + // state.counter[rid] += count; + // state.type = SEMAPHORE; + // return rd_done; //release calling wave + // This action will release count number of waves, immediately if queued, + // or as they arrive from the noted resource. + void + Inst_DS__DS_GWS_SEMA_BR::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_SEMA_P class methods --- + + Inst_DS__DS_GWS_SEMA_P::Inst_DS__DS_GWS_SEMA_P(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_sema_p") + { + } // Inst_DS__DS_GWS_SEMA_P + + Inst_DS__DS_GWS_SEMA_P::~Inst_DS__DS_GWS_SEMA_P() + { + } // ~Inst_DS__DS_GWS_SEMA_P + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // queueing it until counter enables a release and then decrementing the + // counter of the resource as a semaphore. 
+ // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0]; + // state.type = SEMAPHORE; + // ENQUEUE until(state[rid].counter > 0) + // state[rid].counter--; + // return rd_done + void + Inst_DS__DS_GWS_SEMA_P::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_GWS_BARRIER class methods --- + + Inst_DS__DS_GWS_BARRIER::Inst_DS__DS_GWS_BARRIER(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_gws_barrier") + { + } // Inst_DS__DS_GWS_BARRIER + + Inst_DS__DS_GWS_BARRIER::~Inst_DS__DS_GWS_BARRIER() + { + } // ~Inst_DS__DS_GWS_BARRIER + + // --- description from .arch file --- + // GDS Only: The GWS resource indicated will process this opcode by + // queueing it until barrier is satisfied. The number of waves needed is + // passed in as DATA of first valid thread. + // //Determine the GWS resource to work on + // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + OFFSET0[5:0]; + // index = find first valid (vector mask); + // value = DATA[thread: index]; + // // Input Decision Machine + // state.type[rid] = BARRIER; + // if (state[rid].counter <= 0) { + // thread[rid].flag = state[rid].flag; + // ENQUEUE; + // state[rid].flag = !state.flag; + // state[rid].counter = value; + // return rd_done; + // } else { + // state[rid].counter--; + // thread.flag = state[rid].flag; + // ENQUEUE; + // } + // Since the waves deliver the count for the next barrier, this function + // can have a different size barrier for each occurrence. 
+ // // Release Machine + // if (state.type == BARRIER) { + // if (state.flag != thread.flag) { + // return rd_done; + // } + // } + void + Inst_DS__DS_GWS_BARRIER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_CONSUME class methods --- + + Inst_DS__DS_CONSUME::Inst_DS__DS_CONSUME(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_consume") + { + } // Inst_DS__DS_CONSUME + + Inst_DS__DS_CONSUME::~Inst_DS__DS_CONSUME() + { + } // ~Inst_DS__DS_CONSUME + + // --- description from .arch file --- + // LDS & GDS. Subtract (count_bits(exec_mask)) from the value stored in DS + // memory at (M0.base + instr_offset). Return the pre-operation value to + // VGPRs. + void + Inst_DS__DS_CONSUME::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_APPEND class methods --- + + Inst_DS__DS_APPEND::Inst_DS__DS_APPEND(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_append") + { + } // Inst_DS__DS_APPEND + + Inst_DS__DS_APPEND::~Inst_DS__DS_APPEND() + { + } // ~Inst_DS__DS_APPEND + + // --- description from .arch file --- + // LDS & GDS. Add (count_bits(exec_mask)) to the value stored in DS memory + // at (M0.base + instr_offset). Return the pre-operation value to VGPRs. + void + Inst_DS__DS_APPEND::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ORDERED_COUNT class methods --- + + Inst_DS__DS_ORDERED_COUNT::Inst_DS__DS_ORDERED_COUNT(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_ordered_count") + { + } // Inst_DS__DS_ORDERED_COUNT + + Inst_DS__DS_ORDERED_COUNT::~Inst_DS__DS_ORDERED_COUNT() + { + } // ~Inst_DS__DS_ORDERED_COUNT + + // --- description from .arch file --- + // GDS-only. Add (count_bits(exec_mask)) to one of 4 dedicated + // ordered-count counters (aka 'packers'). Additional bits of instr.offset + // field are overloaded to hold packer-id, 'last'. 
+ void + Inst_DS__DS_ORDERED_COUNT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_ADD_SRC2_U64 class methods --- + + Inst_DS__DS_ADD_SRC2_U64::Inst_DS__DS_ADD_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_add_src2_u64") + { + } // Inst_DS__DS_ADD_SRC2_U64 + + Inst_DS__DS_ADD_SRC2_U64::~Inst_DS__DS_ADD_SRC2_U64() + { + } // ~Inst_DS__DS_ADD_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] + MEM[B]. + void + Inst_DS__DS_ADD_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_SUB_SRC2_U64 class methods --- + + Inst_DS__DS_SUB_SRC2_U64::Inst_DS__DS_SUB_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_sub_src2_u64") + { + } // Inst_DS__DS_SUB_SRC2_U64 + + Inst_DS__DS_SUB_SRC2_U64::~Inst_DS__DS_SUB_SRC2_U64() + { + } // ~Inst_DS__DS_SUB_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] - MEM[B]. + void + Inst_DS__DS_SUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_RSUB_SRC2_U64 class methods --- + + Inst_DS__DS_RSUB_SRC2_U64::Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_rsub_src2_u64") + { + } // Inst_DS__DS_RSUB_SRC2_U64 + + Inst_DS__DS_RSUB_SRC2_U64::~Inst_DS__DS_RSUB_SRC2_U64() + { + } // ~Inst_DS__DS_RSUB_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B] - MEM[A]. 
+ void + Inst_DS__DS_RSUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_INC_SRC2_U64 class methods --- + + Inst_DS__DS_INC_SRC2_U64::Inst_DS__DS_INC_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_inc_src2_u64") + { + } // Inst_DS__DS_INC_SRC2_U64 + + Inst_DS__DS_INC_SRC2_U64::~Inst_DS__DS_INC_SRC2_U64() + { + } // ~Inst_DS__DS_INC_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1). + void + Inst_DS__DS_INC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_DEC_SRC2_U64 class methods --- + + Inst_DS__DS_DEC_SRC2_U64::Inst_DS__DS_DEC_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_dec_src2_u64") + { + } // Inst_DS__DS_DEC_SRC2_U64 + + Inst_DS__DS_DEC_SRC2_U64::~Inst_DS__DS_DEC_SRC2_U64() + { + } // ~Inst_DS__DS_DEC_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1). + // Uint decrement. + void + Inst_DS__DS_DEC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_I64 class methods --- + + Inst_DS__DS_MIN_SRC2_I64::Inst_DS__DS_MIN_SRC2_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_i64") + { + } // Inst_DS__DS_MIN_SRC2_I64 + + Inst_DS__DS_MIN_SRC2_I64::~Inst_DS__DS_MIN_SRC2_I64() + { + } // ~Inst_DS__DS_MIN_SRC2_I64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). 
+ void + Inst_DS__DS_MIN_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_I64 class methods --- + + Inst_DS__DS_MAX_SRC2_I64::Inst_DS__DS_MAX_SRC2_I64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_i64") + { + } // Inst_DS__DS_MAX_SRC2_I64 + + Inst_DS__DS_MAX_SRC2_I64::~Inst_DS__DS_MAX_SRC2_I64() + { + } // ~Inst_DS__DS_MAX_SRC2_I64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). + void + Inst_DS__DS_MAX_SRC2_I64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_U64 class methods --- + + Inst_DS__DS_MIN_SRC2_U64::Inst_DS__DS_MIN_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_u64") + { + } // Inst_DS__DS_MIN_SRC2_U64 + + Inst_DS__DS_MIN_SRC2_U64::~Inst_DS__DS_MIN_SRC2_U64() + { + } // ~Inst_DS__DS_MIN_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = min(MEM[A], MEM[B]). + void + Inst_DS__DS_MIN_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_U64 class methods --- + + Inst_DS__DS_MAX_SRC2_U64::Inst_DS__DS_MAX_SRC2_U64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_u64") + { + } // Inst_DS__DS_MAX_SRC2_U64 + + Inst_DS__DS_MAX_SRC2_U64::~Inst_DS__DS_MAX_SRC2_U64() + { + } // ~Inst_DS__DS_MAX_SRC2_U64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = max(MEM[A], MEM[B]). 
+ void + Inst_DS__DS_MAX_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_AND_SRC2_B64 class methods --- + + Inst_DS__DS_AND_SRC2_B64::Inst_DS__DS_AND_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_and_src2_b64") + { + } // Inst_DS__DS_AND_SRC2_B64 + + Inst_DS__DS_AND_SRC2_B64::~Inst_DS__DS_AND_SRC2_B64() + { + } // ~Inst_DS__DS_AND_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] & MEM[B]. + void + Inst_DS__DS_AND_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_OR_SRC2_B64 class methods --- + + Inst_DS__DS_OR_SRC2_B64::Inst_DS__DS_OR_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_or_src2_b64") + { + } // Inst_DS__DS_OR_SRC2_B64 + + Inst_DS__DS_OR_SRC2_B64::~Inst_DS__DS_OR_SRC2_B64() + { + } // ~Inst_DS__DS_OR_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] | MEM[B]. + void + Inst_DS__DS_OR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_XOR_SRC2_B64 class methods --- + + Inst_DS__DS_XOR_SRC2_B64::Inst_DS__DS_XOR_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_xor_src2_b64") + { + } // Inst_DS__DS_XOR_SRC2_B64 + + Inst_DS__DS_XOR_SRC2_B64::~Inst_DS__DS_XOR_SRC2_B64() + { + } // ~Inst_DS__DS_XOR_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[A] ^ MEM[B]. 
+ void + Inst_DS__DS_XOR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_SRC2_B64 class methods --- + + Inst_DS__DS_WRITE_SRC2_B64::Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_src2_b64") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_SRC2_B64 + + Inst_DS__DS_WRITE_SRC2_B64::~Inst_DS__DS_WRITE_SRC2_B64() + { + } // ~Inst_DS__DS_WRITE_SRC2_B64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = MEM[B]. + // Write qword. + void + Inst_DS__DS_WRITE_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MIN_SRC2_F64 class methods --- + + Inst_DS__DS_MIN_SRC2_F64::Inst_DS__DS_MIN_SRC2_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_min_src2_f64") + { + setFlag(F64); + } // Inst_DS__DS_MIN_SRC2_F64 + + Inst_DS__DS_MIN_SRC2_F64::~Inst_DS__DS_MIN_SRC2_F64() + { + } // ~Inst_DS__DS_MIN_SRC2_F64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. + // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MIN_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_MAX_SRC2_F64 class methods --- + + Inst_DS__DS_MAX_SRC2_F64::Inst_DS__DS_MAX_SRC2_F64(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_max_src2_f64") + { + setFlag(F64); + } // Inst_DS__DS_MAX_SRC2_F64 + + Inst_DS__DS_MAX_SRC2_F64::~Inst_DS__DS_MAX_SRC2_F64() + { + } // ~Inst_DS__DS_MAX_SRC2_F64 + + // --- description from .arch file --- + // 64b: + // A = ADDR_BASE; + // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : + // --- {offset1[6],offset1[6:0],offset0}); + // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. 
+ // Float, handles NaN/INF/denorm. + void + Inst_DS__DS_MAX_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_DS__DS_WRITE_B96 class methods --- + + Inst_DS__DS_WRITE_B96::Inst_DS__DS_WRITE_B96(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b96") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B96 + + Inst_DS__DS_WRITE_B96::~Inst_DS__DS_WRITE_B96() + { + } // ~Inst_DS__DS_WRITE_B96 + + // --- description from .arch file --- + // {MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[95:0]. + // Tri-dword write. + void + Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); + + addr.read(); + data0.read(); + data1.read(); + data2.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite<3>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_WRITE_B128 class methods 
--- + + Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_write_b128") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_DS__DS_WRITE_B128 + + Inst_DS__DS_WRITE_B128::~Inst_DS__DS_WRITE_B128() + { + } // ~Inst_DS__DS_WRITE_B128 + + // --- description from .arch file --- + // {MEM[ADDR + 12], MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[127:0]. + // Qword write. + void + Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3); + + addr.read(); + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_DS__DS_READ_B96 class methods --- + + 
Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b96") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B96 + + Inst_DS__DS_READ_B96::~Inst_DS__DS_READ_B96() + { + } // ~Inst_DS__DS_READ_B96 + + // --- description from .arch file --- + // Tri-dword read. + void + Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<3>(gpuDynInst, offset); + } + + void + Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + } + // --- Inst_DS__DS_READ_B128 class methods --- + + Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_b128") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_B128 + + Inst_DS__DS_READ_B128::~Inst_DS__DS_READ_B128() + { + } // ~Inst_DS__DS_READ_B128 + + // --- description from .arch file --- + // Qword read. 
+ void + Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/gcn3/insts/gpu_static_inst.cc b/src/arch/amdgpu/vega/insts/exp.cc similarity index 72% rename from src/arch/amdgpu/gcn3/insts/gpu_static_inst.cc rename to src/arch/amdgpu/vega/insts/exp.cc index 8e5310843e..31b6ded10f 100644 --- a/src/arch/amdgpu/gcn3/insts/gpu_static_inst.cc +++ b/src/arch/amdgpu/vega/insts/exp.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. 
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,31 +29,30 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh" - -#include "arch/amdgpu/gcn3/gpu_decoder.hh" -#include "arch/amdgpu/gcn3/insts/instructions.hh" -#include "debug/GPUExec.hh" -#include "gpu-compute/shader.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" namespace gem5 { -namespace Gcn3ISA +namespace VegaISA { - GCN3GPUStaticInst::GCN3GPUStaticInst(const std::string &opcode) - : GPUStaticInst(opcode), _srcLiteral(0) - { - } + // --- Inst_EXP__EXP class methods --- - GCN3GPUStaticInst::~GCN3GPUStaticInst() + Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt) + : Inst_EXP(iFmt, "exp") { - } + } // Inst_EXP__EXP + Inst_EXP__EXP::~Inst_EXP__EXP() + { + } // ~Inst_EXP__EXP + + // --- description from .arch file --- + // Export through SX. void - GCN3GPUStaticInst::panicUnimplemented() const + Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst) { - fatal("Encountered unimplemented GCN3 instruction: %s\n", _opcode); - } -} // namespace Gcn3ISA + panicUnimplemented(); + } // execute +} // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/flat.cc b/src/arch/amdgpu/vega/insts/flat.cc new file mode 100644 index 0000000000..8dce8d4299 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/flat.cc @@ -0,0 +1,2164 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_FLAT__FLAT_LOAD_UBYTE class methods --- + + Inst_FLAT__FLAT_LOAD_UBYTE::Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_ubyte") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_UBYTE + + Inst_FLAT__FLAT_LOAD_UBYTE::~Inst_FLAT__FLAT_LOAD_UBYTE() + { + } // ~Inst_FLAT__FLAT_LOAD_UBYTE + + // --- description from .arch file --- + // Untyped buffer load unsigned byte (zero extend to VGPR destination). 
+ void + Inst_FLAT__FLAT_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<VecElemU8>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>( + gpuDynInst->d_data))[lane]); + } + } + vdst.write(); + } // execute + // --- Inst_FLAT__FLAT_LOAD_SBYTE class methods --- + + Inst_FLAT__FLAT_LOAD_SBYTE::Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_sbyte") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_SBYTE + + Inst_FLAT__FLAT_LOAD_SBYTE::~Inst_FLAT__FLAT_LOAD_SBYTE() + { + } // ~Inst_FLAT__FLAT_LOAD_SBYTE + + // --- description from .arch file --- + // Untyped buffer load signed byte (sign extend to VGPR destination).
+ void + Inst_FLAT__FLAT_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<VecElemI8>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemI32)((reinterpret_cast<VecElemI8*>( + gpuDynInst->d_data))[lane]); + } + } + vdst.write(); + } // execute + // --- Inst_FLAT__FLAT_LOAD_USHORT class methods --- + + Inst_FLAT__FLAT_LOAD_USHORT::Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_ushort") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_USHORT + + Inst_FLAT__FLAT_LOAD_USHORT::~Inst_FLAT__FLAT_LOAD_USHORT() + { + } // ~Inst_FLAT__FLAT_LOAD_USHORT + + // --- description from .arch file --- + // Untyped buffer load unsigned short (zero extend to VGPR destination).
+ void + Inst_FLAT__FLAT_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<VecElemU16>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>( + gpuDynInst->d_data))[lane]); + } + } + vdst.write(); + } // execute + + // --- Inst_FLAT__FLAT_LOAD_SSHORT class methods --- + + Inst_FLAT__FLAT_LOAD_SSHORT::Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_sshort") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_SSHORT + + Inst_FLAT__FLAT_LOAD_SSHORT::~Inst_FLAT__FLAT_LOAD_SSHORT() + { + } // ~Inst_FLAT__FLAT_LOAD_SSHORT + + // --- description from .arch file --- + // Untyped buffer load signed short (sign extend to VGPR destination).
+ void + Inst_FLAT__FLAT_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_FLAT__FLAT_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_FLAT__FLAT_LOAD_DWORD class methods --- + + Inst_FLAT__FLAT_LOAD_DWORD::Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_DWORD + + Inst_FLAT__FLAT_LOAD_DWORD::~Inst_FLAT__FLAT_LOAD_DWORD() + { + } // ~Inst_FLAT__FLAT_LOAD_DWORD + + // --- description from .arch file --- + // Untyped buffer load dword. + void + Inst_FLAT__FLAT_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<VecElemU32>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast<VecElemU32*>( + gpuDynInst->d_data))[lane]; + } + } + vdst.write(); + } // completeAcc + // --- Inst_FLAT__FLAT_LOAD_DWORDX2 class methods --- + + Inst_FLAT__FLAT_LOAD_DWORDX2::Inst_FLAT__FLAT_LOAD_DWORDX2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_DWORDX2
+ + Inst_FLAT__FLAT_LOAD_DWORDX2::~Inst_FLAT__FLAT_LOAD_DWORDX2() + { + } // ~Inst_FLAT__FLAT_LOAD_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer load 2 dwords. + void + Inst_FLAT__FLAT_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<VecElemU64>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU64 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast<VecElemU64*>( + gpuDynInst->d_data))[lane]; + } + } + vdst.write(); + } // completeAcc + // --- Inst_FLAT__FLAT_LOAD_DWORDX3 class methods --- + + Inst_FLAT__FLAT_LOAD_DWORDX3::Inst_FLAT__FLAT_LOAD_DWORDX3( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_dwordx3") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_DWORDX3 + + Inst_FLAT__FLAT_LOAD_DWORDX3::~Inst_FLAT__FLAT_LOAD_DWORDX3() + { + } // ~Inst_FLAT__FLAT_LOAD_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer load 3 dwords.
+ void + Inst_FLAT__FLAT_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<3>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 2]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + } // completeAcc + // --- Inst_FLAT__FLAT_LOAD_DWORDX4 class methods --- + + Inst_FLAT__FLAT_LOAD_DWORDX4::Inst_FLAT__FLAT_LOAD_DWORDX4( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_load_dwordx4") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_FLAT__FLAT_LOAD_DWORDX4 + + Inst_FLAT__FLAT_LOAD_DWORDX4::~Inst_FLAT__FLAT_LOAD_DWORDX4() + { + } // ~Inst_FLAT__FLAT_LOAD_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer load 4 dwords. 
+ void + Inst_FLAT__FLAT_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<4>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_BYTE class methods --- + + Inst_FLAT__FLAT_STORE_BYTE::Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_byte") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_BYTE + + Inst_FLAT__FLAT_STORE_BYTE::~Inst_FLAT__FLAT_STORE_BYTE() + { + } // ~Inst_FLAT__FLAT_STORE_BYTE + + // --- description from .arch file --- + // Untyped buffer store byte. 
+ void + Inst_FLAT__FLAT_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU8 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast<VecElemU8*>(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<VecElemU8>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_FLAT__FLAT_STORE_SHORT class methods --- + + Inst_FLAT__FLAT_STORE_SHORT::Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_short") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_SHORT + + Inst_FLAT__FLAT_STORE_SHORT::~Inst_FLAT__FLAT_STORE_SHORT() + { + } // ~Inst_FLAT__FLAT_STORE_SHORT + + // --- description from .arch file --- + // Untyped buffer store short.
+ void + Inst_FLAT__FLAT_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU16 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_SHORT_D16_HI class methods --- + + Inst_FLAT__FLAT_STORE_SHORT_D16_HI:: + Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_short_d16_hi") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_SHORT_D16_HI + + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::~Inst_FLAT__FLAT_STORE_SHORT_D16_HI() + { + } // ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI + + // --- description from .arch file --- + // Untyped buffer store short. 
+ void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = (data[lane] >> 16); + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORD class methods --- + + Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORD + + Inst_FLAT__FLAT_STORE_DWORD::~Inst_FLAT__FLAT_STORE_DWORD() + { + } // ~Inst_FLAT__FLAT_STORE_DWORD + + // --- description from .arch file --- + // Untyped buffer store dword. 
+ void + Inst_FLAT__FLAT_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<VecElemU32>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORDX2 class methods --- + + Inst_FLAT__FLAT_STORE_DWORDX2::Inst_FLAT__FLAT_STORE_DWORDX2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORDX2 + + Inst_FLAT__FLAT_STORE_DWORDX2::~Inst_FLAT__FLAT_STORE_DWORDX2() + { + } // ~Inst_FLAT__FLAT_STORE_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer store 2 dwords.
+ void + Inst_FLAT__FLAT_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU64 data(gpuDynInst, extData.DATA); + + data.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORDX3 class methods --- + + Inst_FLAT__FLAT_STORE_DWORDX3::Inst_FLAT__FLAT_STORE_DWORDX3( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dwordx3") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORDX3 + + Inst_FLAT__FLAT_STORE_DWORDX3::~Inst_FLAT__FLAT_STORE_DWORDX3() + { + } // ~Inst_FLAT__FLAT_STORE_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer store 3 dwords. 
+ void + Inst_FLAT__FLAT_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data0(gpuDynInst, extData.DATA); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); + + data0.read(); + data1.read(); + data2.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 2] = data2[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<3>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_STORE_DWORDX4 class methods --- + + Inst_FLAT__FLAT_STORE_DWORDX4::Inst_FLAT__FLAT_STORE_DWORDX4( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_FLAT__FLAT_STORE_DWORDX4 + + Inst_FLAT__FLAT_STORE_DWORDX4::~Inst_FLAT__FLAT_STORE_DWORDX4() + { + } // ~Inst_FLAT__FLAT_STORE_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer store 4 dwords. 
+ void + Inst_FLAT__FLAT_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 data0(gpuDynInst, extData.DATA); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3); + + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; + } + } + + issueRequestHelper(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<4>(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SWAP class methods --- + + Inst_FLAT__FLAT_ATOMIC_SWAP::Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_swap") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SWAP + + Inst_FLAT__FLAT_ATOMIC_SWAP::~Inst_FLAT__FLAT_ATOMIC_SWAP() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SWAP + + // --- description from .arch file --- + // 
32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + + // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods --- + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP + ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_cmpswap") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP() + { + } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA[0]; + // cmp = DATA[1]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD::Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD + + Inst_FLAT__FLAT_ATOMIC_ADD::~Inst_FLAT__FLAT_ATOMIC_ADD() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SUB class methods --- + + Inst_FLAT__FLAT_ATOMIC_SUB::Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_sub") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SUB + + Inst_FLAT__FLAT_ATOMIC_SUB::~Inst_FLAT__FLAT_ATOMIC_SUB() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SUB + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMIN class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMIN + + Inst_FLAT__FLAT_ATOMIC_SMIN::~Inst_FLAT__FLAT_ATOMIC_SMIN() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMIN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMIN::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMIN class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMIN::Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umin") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMIN + + Inst_FLAT__FLAT_ATOMIC_UMIN::~Inst_FLAT__FLAT_ATOMIC_UMIN() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMIN + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMIN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMIN::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMAX class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMAX::Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMAX + + Inst_FLAT__FLAT_ATOMIC_SMAX::~Inst_FLAT__FLAT_ATOMIC_SMAX() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMAX::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMAX::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMAX class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMAX::Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umax") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMAX + + Inst_FLAT__FLAT_ATOMIC_UMAX::~Inst_FLAT__FLAT_ATOMIC_UMAX() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMAX + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMAX::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMAX::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_AND class methods --- + + Inst_FLAT__FLAT_ATOMIC_AND::Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_and") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_AND + + Inst_FLAT__FLAT_ATOMIC_AND::~Inst_FLAT__FLAT_ATOMIC_AND() + { + } // ~Inst_FLAT__FLAT_ATOMIC_AND + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_AND::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_AND::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_OR class methods --- + + Inst_FLAT__FLAT_ATOMIC_OR::Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_or") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_OR + + Inst_FLAT__FLAT_ATOMIC_OR::~Inst_FLAT__FLAT_ATOMIC_OR() + { + } // ~Inst_FLAT__FLAT_ATOMIC_OR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA; + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_OR::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_OR::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + + // --- Inst_FLAT__FLAT_ATOMIC_XOR class methods --- + + Inst_FLAT__FLAT_ATOMIC_XOR::Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_xor") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_XOR + + Inst_FLAT__FLAT_ATOMIC_XOR::~Inst_FLAT__FLAT_ATOMIC_XOR() + { + } // ~Inst_FLAT__FLAT_ATOMIC_XOR + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA; + // RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_XOR::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_XOR::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_INC class methods --- + + Inst_FLAT__FLAT_ATOMIC_INC::Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_inc") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_INC + + Inst_FLAT__FLAT_ATOMIC_INC::~Inst_FLAT__FLAT_ATOMIC_INC() + { + } // ~Inst_FLAT__FLAT_ATOMIC_INC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_DEC class methods --- + + Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_dec") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_DEC + + Inst_FLAT__FLAT_ATOMIC_DEC::~Inst_FLAT__FLAT_ATOMIC_DEC() + { + } // ~Inst_FLAT__FLAT_ATOMIC_DEC + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 + // (unsigned compare); RETURN_DATA = tmp. + void + Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SWAP_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_swap_x2") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 + + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::~Inst_FLAT__FLAT_ATOMIC_SWAP_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_cmpswap_x2") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 + + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // src = DATA[0:1]; + // cmp = DATA[2:3]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD_X2::Inst_FLAT__FLAT_ATOMIC_ADD_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add_x2") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD_X2 + + Inst_FLAT__FLAT_ATOMIC_ADD_X2::~Inst_FLAT__FLAT_ATOMIC_ADD_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SUB_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SUB_X2::Inst_FLAT__FLAT_ATOMIC_SUB_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_sub_x2") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SUB_X2 + + Inst_FLAT__FLAT_ATOMIC_SUB_X2::~Inst_FLAT__FLAT_ATOMIC_SUB_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SUB_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMIN_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMIN_X2 + + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::~Inst_FLAT__FLAT_ATOMIC_SMIN_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMIN_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::Inst_FLAT__FLAT_ATOMIC_UMIN_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMIN_X2 + + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::~Inst_FLAT__FLAT_ATOMIC_UMIN_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMIN_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_SMAX_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::Inst_FLAT__FLAT_ATOMIC_SMAX_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_smax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_SMAX_X2 + + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::~Inst_FLAT__FLAT_ATOMIC_SMAX_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_SMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_UMAX_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::Inst_FLAT__FLAT_ATOMIC_UMAX_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_umax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_UMAX_X2 + + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::~Inst_FLAT__FLAT_ATOMIC_UMAX_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_UMAX_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_AND_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_AND_X2::Inst_FLAT__FLAT_ATOMIC_AND_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_and_x2") + { + setFlag(AtomicAnd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_AND_X2 + + Inst_FLAT__FLAT_ATOMIC_AND_X2::~Inst_FLAT__FLAT_ATOMIC_AND_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_AND_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] &= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_AND_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_AND_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_OR_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_OR_X2::Inst_FLAT__FLAT_ATOMIC_OR_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_or_x2") + { + setFlag(AtomicOr); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_OR_X2 + + Inst_FLAT__FLAT_ATOMIC_OR_X2::~Inst_FLAT__FLAT_ATOMIC_OR_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_OR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] |= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_OR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_OR_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_XOR_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_XOR_X2::Inst_FLAT__FLAT_ATOMIC_XOR_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_xor_x2") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_XOR_X2 + + Inst_FLAT__FLAT_ATOMIC_XOR_X2::~Inst_FLAT__FLAT_ATOMIC_XOR_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_XOR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_XOR_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_XOR_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_INC_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_INC_X2::Inst_FLAT__FLAT_ATOMIC_INC_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_inc_x2") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_INC_X2 + + Inst_FLAT__FLAT_ATOMIC_INC_X2::~Inst_FLAT__FLAT_ATOMIC_INC_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_INC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 
0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_DEC_X2 class methods --- + + Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_dec_x2") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_DEC_X2 + + Inst_FLAT__FLAT_ATOMIC_DEC_X2::~Inst_FLAT__FLAT_ATOMIC_DEC_X2() + { + } // ~Inst_FLAT__FLAT_ATOMIC_DEC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD_F32 class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD_F32::Inst_FLAT__FLAT_ATOMIC_ADD_F32( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add_f32") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD_F32 + + Inst_FLAT__FLAT_ATOMIC_ADD_F32::~Inst_FLAT__FLAT_ATOMIC_ADD_F32() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F32 + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 class methods --- + + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_pk_add_f16") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 + + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16() + { + } // ~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 + + void + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + 
void + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_ADD_F64 class methods --- + + Inst_FLAT__FLAT_ATOMIC_ADD_F64::Inst_FLAT__FLAT_ATOMIC_ADD_F64( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_add_f64") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_ADD_F64 + + Inst_FLAT__FLAT_ATOMIC_ADD_F64::~Inst_FLAT__FLAT_ATOMIC_ADD_F64() + { + } // ~Inst_FLAT__FLAT_ATOMIC_ADD_F64 + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F64::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_ADD_F64::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_MIN_F64 class methods --- + + Inst_FLAT__FLAT_ATOMIC_MIN_F64::Inst_FLAT__FLAT_ATOMIC_MIN_F64( + InFmt_FLAT *iFmt) + : Inst_FLAT(iFmt, "flat_atomic_min_f64") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_MIN_F64 + + Inst_FLAT__FLAT_ATOMIC_MIN_F64::~Inst_FLAT__FLAT_ATOMIC_MIN_F64() + { + } // ~Inst_FLAT__FLAT_ATOMIC_MIN_F64 + + void + Inst_FLAT__FLAT_ATOMIC_MIN_F64::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_MIN_F64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_MIN_F64::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc + // --- Inst_FLAT__FLAT_ATOMIC_MAX_F64 class methods --- + + Inst_FLAT__FLAT_ATOMIC_MAX_F64::Inst_FLAT__FLAT_ATOMIC_MAX_F64( + InFmt_FLAT *iFmt) + : 
Inst_FLAT(iFmt, "flat_atomic_max_f64") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + } // Inst_FLAT__FLAT_ATOMIC_MAX_F64 + + Inst_FLAT__FLAT_ATOMIC_MAX_F64::~Inst_FLAT__FLAT_ATOMIC_MAX_F64() + { + } // ~Inst_FLAT__FLAT_ATOMIC_MAX_F64 + + void + Inst_FLAT__FLAT_ATOMIC_MAX_F64::execute(GPUDynInstPtr gpuDynInst) + { + atomicExecute(gpuDynInst); + } // execute + + void + Inst_FLAT__FLAT_ATOMIC_MAX_F64::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_FLAT__FLAT_ATOMIC_MAX_F64::completeAcc(GPUDynInstPtr gpuDynInst) + { + atomicComplete(gpuDynInst); + } // completeAcc +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/inst_util.hh b/src/arch/amdgpu/vega/insts/inst_util.hh index 7ec2e2ddd3..ac8c572d77 100644 --- a/src/arch/amdgpu/vega/insts/inst_util.hh +++ b/src/arch/amdgpu/vega/insts/inst_util.hh @@ -35,6 +35,7 @@ #include #include "arch/amdgpu/vega/gpu_registers.hh" +#include "arch/amdgpu/vega/insts/gpu_static_inst.hh" namespace gem5 { @@ -315,7 +316,8 @@ namespace VegaISA * 0x142: broadcast 15th thread of each row to next row * 0x143: broadcast thread 31 to rows 2 and 3 */ - int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, + inline int + dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool & outOfBounds) { // local variables @@ -699,7 +701,7 @@ namespace VegaISA if (sel < SDWA_WORD_0) { // we are selecting 1 byte // if we sign extended depends on upper-most bit of byte 0 signExt = (signExt && - (bits(currDstVal, VegaISA::MSB_PER_WORD, 0) & 0x80)); + (bits(currDstVal, VegaISA::MSB_PER_BYTE, 0) & 0x80)); for (int byte = 0; byte < 4; ++byte) { low_bit = byte * VegaISA::BITS_PER_BYTE; @@ -712,7 +714,7 @@ namespace VegaISA 3. 
byte > sel && signExt: we're sign extending and this byte is one of the bytes we need to sign extend */ - origBits_thisByte = bits(origDstVal, high_bit, low_bit); + origBits_thisByte = bits(origDstVal, VegaISA::MSB_PER_BYTE, 0); currBits_thisByte = bits(currDstVal, high_bit, low_bit); newBits = ((byte == sel) ? origBits_thisByte : ((preserve) ? currBits_thisByte : @@ -737,7 +739,7 @@ namespace VegaISA 3. word > (sel & 1) && signExt: we're sign extending and this word is one of the words we need to sign extend */ - origBits_thisWord = bits(origDstVal, high_bit, low_bit); + origBits_thisWord = bits(origDstVal, VegaISA::MSB_PER_WORD, 0); currBits_thisWord = bits(currDstVal, high_bit, low_bit); newBits = ((word == (sel & 0x1)) ? origBits_thisWord : ((preserve) ? currBits_thisWord : diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc deleted file mode 100644 index cd4ad74e6e..0000000000 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ /dev/null @@ -1,45912 +0,0 @@ -/* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "arch/amdgpu/vega/insts/instructions.hh" - -#include - -#include "arch/amdgpu/vega/insts/inst_util.hh" -#include "debug/VEGA.hh" -#include "debug/GPUSync.hh" -#include "dev/amdgpu/hwreg_defines.hh" -#include "gpu-compute/shader.hh" - -namespace gem5 -{ - -namespace VegaISA -{ - // --- Inst_SOP2__S_ADD_U32 class methods --- - - Inst_SOP2__S_ADD_U32::Inst_SOP2__S_ADD_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_add_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADD_U32 - - Inst_SOP2__S_ADD_U32::~Inst_SOP2__S_ADD_U32() - { - } // ~Inst_SOP2__S_ADD_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - // SCC = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an unsigned - // --- overflow/carry-out for S_ADDC_U32. 
- void - Inst_SOP2__S_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() + src1.rawData(); - scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData()) - >= 0x100000000ULL ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_SUB_U32 class methods --- - - Inst_SOP2__S_SUB_U32::Inst_SOP2__S_SUB_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_sub_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUB_U32 - - Inst_SOP2__S_SUB_U32::~Inst_SOP2__S_SUB_U32() - { - } // ~Inst_SOP2__S_SUB_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - // SCC = (S1.u > S0.u ? 1 : 0) is an unsigned overflow or carry-out for - // --- S_SUBB_U32. - void - Inst_SOP2__S_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() - src1.rawData(); - scc = (src1.rawData() > src0.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ADD_I32 class methods --- - - Inst_SOP2__S_ADD_I32::Inst_SOP2__S_ADD_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_add_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADD_I32 - - Inst_SOP2__S_ADD_I32::~Inst_SOP2__S_ADD_I32() - { - } // ~Inst_SOP2__S_ADD_I32 - - // --- description from .arch file --- - // D.i = S0.i + S1.i; - // SCC = (S0.u[31] == S1.u[31] && S0.u[31] != D.u[31]) is a signed - // overflow. - // This opcode is not suitable for use with S_ADDC_U32 for implementing - // 64-bit operations. 
- void - Inst_SOP2__S_ADD_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() + src1.rawData(); - scc = (bits(src0.rawData(), 31) == bits(src1.rawData(), 31) - && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) - ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_SUB_I32 class methods --- - - Inst_SOP2__S_SUB_I32::Inst_SOP2__S_SUB_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_sub_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUB_I32 - - Inst_SOP2__S_SUB_I32::~Inst_SOP2__S_SUB_I32() - { - } // ~Inst_SOP2__S_SUB_I32 - - // --- description from .arch file --- - // D.i = S0.i - S1.i; - // SCC = (S0.u[31] != S1.u[31] && S0.u[31] != D.u[31]) is a signed - // overflow. - // CAUTION: The condition code behaviour for this opcode is inconsistent - // with V_SUB_I32; see V_SUB_I32 for further details. - // This opcode is not suitable for use with S_SUBB_U32 for implementing - // 64-bit operations. - void - Inst_SOP2__S_SUB_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() - src1.rawData(); - scc = (bits(src0.rawData(), 31) != bits(src1.rawData(), 31) - && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ADDC_U32 class methods --- - - Inst_SOP2__S_ADDC_U32::Inst_SOP2__S_ADDC_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_addc_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_ADDC_U32 - - Inst_SOP2__S_ADDC_U32::~Inst_SOP2__S_ADDC_U32() - { - } // ~Inst_SOP2__S_ADDC_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + SCC; - // SCC = (S0.u + S1.u + SCC >= 0x800000000ULL ? 1 : 0) is an unsigned - // overflow. - void - Inst_SOP2__S_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = src0.rawData() + src1.rawData() + scc.rawData(); - scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData() - + (ScalarRegU64)scc.rawData()) >= 0x100000000ULL ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_SUBB_U32 class methods --- - - Inst_SOP2__S_SUBB_U32::Inst_SOP2__S_SUBB_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_subb_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_SUBB_U32 - - Inst_SOP2__S_SUBB_U32::~Inst_SOP2__S_SUBB_U32() - { - } // ~Inst_SOP2__S_SUBB_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u - SCC; - // SCC = (S1.u + SCC > S0.u ? 1 : 0) is an unsigned overflow. - void - Inst_SOP2__S_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = src0.rawData() - src1.rawData() - scc.rawData(); - scc = (src1.rawData() + scc.rawData()) > src0.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MIN_I32 class methods --- - - Inst_SOP2__S_MIN_I32::Inst_SOP2__S_MIN_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_min_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MIN_I32 - - Inst_SOP2__S_MIN_I32::~Inst_SOP2__S_MIN_I32() - { - } // ~Inst_SOP2__S_MIN_I32 - - // --- description from .arch file --- - // D.i = (S0.i < S1.i) ? S0.i : S1.i; - // SCC = 1 if S0 is chosen as the minimum value. - void - Inst_SOP2__S_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::min(src0.rawData(), src1.rawData()); - scc = (src0.rawData() < src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MIN_U32 class methods --- - - Inst_SOP2__S_MIN_U32::Inst_SOP2__S_MIN_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_min_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MIN_U32 - - Inst_SOP2__S_MIN_U32::~Inst_SOP2__S_MIN_U32() - { - } // ~Inst_SOP2__S_MIN_U32 - - // --- description from .arch file --- - // D.u = (S0.u < S1.u) ? S0.u : S1.u; - // SCC = 1 if S0 is chosen as the minimum value. - void - Inst_SOP2__S_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::min(src0.rawData(), src1.rawData()); - scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MAX_I32 class methods --- - - Inst_SOP2__S_MAX_I32::Inst_SOP2__S_MAX_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_max_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MAX_I32 - - Inst_SOP2__S_MAX_I32::~Inst_SOP2__S_MAX_I32() - { - } // ~Inst_SOP2__S_MAX_I32 - - // --- description from .arch file --- - // D.i = (S0.i > S1.i) ? S0.i : S1.i; - // SCC = 1 if S0 is chosen as the maximum value. - void - Inst_SOP2__S_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::max(src0.rawData(), src1.rawData()); - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_MAX_U32 class methods --- - - Inst_SOP2__S_MAX_U32::Inst_SOP2__S_MAX_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_max_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MAX_U32 - - Inst_SOP2__S_MAX_U32::~Inst_SOP2__S_MAX_U32() - { - } // ~Inst_SOP2__S_MAX_U32 - - // --- description from .arch file --- - // D.u = (S0.u > S1.u) ? S0.u : S1.u; - // SCC = 1 if S0 is chosen as the maximum value. - void - Inst_SOP2__S_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = std::max(src0.rawData(), src1.rawData()); - scc = (src0.rawData() > src1.rawData()) ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_CSELECT_B32 class methods --- - - Inst_SOP2__S_CSELECT_B32::Inst_SOP2__S_CSELECT_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cselect_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_CSELECT_B32 - - Inst_SOP2__S_CSELECT_B32::~Inst_SOP2__S_CSELECT_B32() - { - } // ~Inst_SOP2__S_CSELECT_B32 - - // --- description from .arch file --- - // D.u = SCC ? S0.u : S1.u (conditional select). - void - Inst_SOP2__S_CSELECT_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = scc.rawData() ? src0.rawData() : src1.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_CSELECT_B64 class methods --- - - Inst_SOP2__S_CSELECT_B64::Inst_SOP2__S_CSELECT_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cselect_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_CSELECT_B64 - - Inst_SOP2__S_CSELECT_B64::~Inst_SOP2__S_CSELECT_B64() - { - } // ~Inst_SOP2__S_CSELECT_B64 - - // --- description from .arch file --- - // D.u64 = SCC ? S0.u64 : S1.u64 (conditional select). - void - Inst_SOP2__S_CSELECT_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - scc.read(); - - sdst = scc.rawData() ? 
src0.rawData() : src1.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_AND_B32 class methods --- - - Inst_SOP2__S_AND_B32::Inst_SOP2__S_AND_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_and_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_AND_B32 - - Inst_SOP2__S_AND_B32::~Inst_SOP2__S_AND_B32() - { - } // ~Inst_SOP2__S_AND_B32 - - // --- description from .arch file --- - // D.u = S0.u & S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() & src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_AND_B64 class methods --- - - Inst_SOP2__S_AND_B64::Inst_SOP2__S_AND_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_and_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_AND_B64 - - Inst_SOP2__S_AND_B64::~Inst_SOP2__S_AND_B64() - { - } // ~Inst_SOP2__S_AND_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 & S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_AND_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() & src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_OR_B32 class methods --- - - Inst_SOP2__S_OR_B32::Inst_SOP2__S_OR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_or_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_OR_B32 - - Inst_SOP2__S_OR_B32::~Inst_SOP2__S_OR_B32() - { - } // ~Inst_SOP2__S_OR_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() | src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_OR_B64 class methods --- - - Inst_SOP2__S_OR_B64::Inst_SOP2__S_OR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_or_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_OR_B64 - - Inst_SOP2__S_OR_B64::~Inst_SOP2__S_OR_B64() - { - } // ~Inst_SOP2__S_OR_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 | S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_OR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() | src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XOR_B32 class methods --- - - Inst_SOP2__S_XOR_B32::Inst_SOP2__S_XOR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xor_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_XOR_B32 - - Inst_SOP2__S_XOR_B32::~Inst_SOP2__S_XOR_B32() - { - } // ~Inst_SOP2__S_XOR_B32 - - // --- description from .arch file --- - // D.u = S0.u ^ S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() ^ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XOR_B64 class methods --- - - Inst_SOP2__S_XOR_B64::Inst_SOP2__S_XOR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xor_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_XOR_B64 - - Inst_SOP2__S_XOR_B64::~Inst_SOP2__S_XOR_B64() - { - } // ~Inst_SOP2__S_XOR_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 ^ S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() ^ src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ANDN2_B32 class methods --- - - Inst_SOP2__S_ANDN2_B32::Inst_SOP2__S_ANDN2_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_andn2_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_ANDN2_B32 - - Inst_SOP2__S_ANDN2_B32::~Inst_SOP2__S_ANDN2_B32() - { - } // ~Inst_SOP2__S_ANDN2_B32 - - // --- description from .arch file --- - // D.u = S0.u & ~S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ANDN2_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() &~ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ANDN2_B64 class methods --- - - Inst_SOP2__S_ANDN2_B64::Inst_SOP2__S_ANDN2_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_andn2_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_ANDN2_B64 - - Inst_SOP2__S_ANDN2_B64::~Inst_SOP2__S_ANDN2_B64() - { - } // ~Inst_SOP2__S_ANDN2_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 & ~S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ANDN2_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() &~ src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ORN2_B32 class methods --- - - Inst_SOP2__S_ORN2_B32::Inst_SOP2__S_ORN2_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_orn2_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_ORN2_B32 - - Inst_SOP2__S_ORN2_B32::~Inst_SOP2__S_ORN2_B32() - { - } // ~Inst_SOP2__S_ORN2_B32 - - // --- description from .arch file --- - // D.u = S0.u | ~S1.u; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ORN2_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() |~ src1.rawData(); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ORN2_B64 class methods --- - - Inst_SOP2__S_ORN2_B64::Inst_SOP2__S_ORN2_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_orn2_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_ORN2_B64 - - Inst_SOP2__S_ORN2_B64::~Inst_SOP2__S_ORN2_B64() - { - } // ~Inst_SOP2__S_ORN2_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 | ~S1.u64; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_ORN2_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = src0.rawData() |~ src1.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NAND_B32 class methods --- - - Inst_SOP2__S_NAND_B32::Inst_SOP2__S_NAND_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nand_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_NAND_B32 - - Inst_SOP2__S_NAND_B32::~Inst_SOP2__S_NAND_B32() - { - } // ~Inst_SOP2__S_NAND_B32 - - // --- description from .arch file --- - // D.u = ~(S0.u & S1.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NAND_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() & src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NAND_B64 class methods --- - - Inst_SOP2__S_NAND_B64::Inst_SOP2__S_NAND_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nand_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_NAND_B64 - - Inst_SOP2__S_NAND_B64::~Inst_SOP2__S_NAND_B64() - { - } // ~Inst_SOP2__S_NAND_B64 - - // --- description from .arch file --- - // D.u64 = ~(S0.u64 & S1.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NAND_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() & src1.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NOR_B32 class methods --- - - Inst_SOP2__S_NOR_B32::Inst_SOP2__S_NOR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nor_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_NOR_B32 - - Inst_SOP2__S_NOR_B32::~Inst_SOP2__S_NOR_B32() - { - } // ~Inst_SOP2__S_NOR_B32 - - // --- description from .arch file --- - // D.u = ~(S0.u | S1.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() | src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_NOR_B64 class methods --- - - Inst_SOP2__S_NOR_B64::Inst_SOP2__S_NOR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_nor_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_NOR_B64 - - Inst_SOP2__S_NOR_B64::~Inst_SOP2__S_NOR_B64() - { - } // ~Inst_SOP2__S_NOR_B64 - - // --- description from .arch file --- - // D.u64 = ~(S0.u64 | S1.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_NOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() | src1.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XNOR_B32 class methods --- - - Inst_SOP2__S_XNOR_B32::Inst_SOP2__S_XNOR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xnor_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_XNOR_B32 - - Inst_SOP2__S_XNOR_B32::~Inst_SOP2__S_XNOR_B32() - { - } // ~Inst_SOP2__S_XNOR_B32 - - // --- description from .arch file --- - // D.u = ~(S0.u ^ S1.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XNOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() ^ src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_XNOR_B64 class methods --- - - Inst_SOP2__S_XNOR_B64::Inst_SOP2__S_XNOR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_xnor_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_XNOR_B64 - - Inst_SOP2__S_XNOR_B64::~Inst_SOP2__S_XNOR_B64() - { - } // ~Inst_SOP2__S_XNOR_B64 - - // --- description from .arch file --- - // D.u64 = ~(S0.u64 ^ S1.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_XNOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = ~(src0.rawData() ^ src1.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHL_B32 class methods --- - - Inst_SOP2__S_LSHL_B32::Inst_SOP2__S_LSHL_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshl_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHL_B32 - - Inst_SOP2__S_LSHL_B32::~Inst_SOP2__S_LSHL_B32() - { - } // ~Inst_SOP2__S_LSHL_B32 - - // --- description from .arch file --- - // D.u = S0.u << S1.u[4:0]; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_LSHL_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() << bits(src1.rawData(), 4, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHL_B64 class methods --- - - Inst_SOP2__S_LSHL_B64::Inst_SOP2__S_LSHL_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshl_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHL_B64 - - Inst_SOP2__S_LSHL_B64::~Inst_SOP2__S_LSHL_B64() - { - } // ~Inst_SOP2__S_LSHL_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 << S1.u[5:0]; - // SCC = 1 if result is non-zero. - void - Inst_SOP2__S_LSHL_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() << bits(src1.rawData(), 5, 0)); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHR_B32 class methods --- - - Inst_SOP2__S_LSHR_B32::Inst_SOP2__S_LSHR_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshr_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHR_B32 - - Inst_SOP2__S_LSHR_B32::~Inst_SOP2__S_LSHR_B32() - { - } // ~Inst_SOP2__S_LSHR_B32 - - // --- description from .arch file --- - // D.u = S0.u >> S1.u[4:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to zero. - void - Inst_SOP2__S_LSHR_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_LSHR_B64 class methods --- - - Inst_SOP2__S_LSHR_B64::Inst_SOP2__S_LSHR_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_lshr_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_LSHR_B64 - - Inst_SOP2__S_LSHR_B64::~Inst_SOP2__S_LSHR_B64() - { - } // ~Inst_SOP2__S_LSHR_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64 >> S1.u[5:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to zero. - void - Inst_SOP2__S_LSHR_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ASHR_I32 class methods --- - - Inst_SOP2__S_ASHR_I32::Inst_SOP2__S_ASHR_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_ashr_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ASHR_I32 - - Inst_SOP2__S_ASHR_I32::~Inst_SOP2__S_ASHR_I32() - { - } // ~Inst_SOP2__S_ASHR_I32 - - // --- description from .arch file --- - // D.i = signext(S0.i) >> S1.u[4:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to the sign bit of the input value. - void - Inst_SOP2__S_ASHR_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_ASHR_I64 class methods --- - - Inst_SOP2__S_ASHR_I64::Inst_SOP2__S_ASHR_I64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_ashr_i64") - { - setFlag(ALU); - } // Inst_SOP2__S_ASHR_I64 - - Inst_SOP2__S_ASHR_I64::~Inst_SOP2__S_ASHR_I64() - { - } // ~Inst_SOP2__S_ASHR_I64 - - // --- description from .arch file --- - // D.i64 = signext(S0.i64) >> S1.u[5:0]; - // SCC = 1 if result is non-zero. - // The vacated bits are set to the sign bit of the input value. - void - Inst_SOP2__S_ASHR_I64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFM_B32 class methods --- - - Inst_SOP2__S_BFM_B32::Inst_SOP2__S_BFM_B32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfm_b32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFM_B32 - - Inst_SOP2__S_BFM_B32::~Inst_SOP2__S_BFM_B32() - { - } // ~Inst_SOP2__S_BFM_B32 - - // --- description from .arch file --- - // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0] (bitfield mask). - void - Inst_SOP2__S_BFM_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - sdst = ((1 << bits(src0.rawData(), 4, 0)) - 1) - << bits(src1.rawData(), 4, 0); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_BFM_B64 class methods --- - - Inst_SOP2__S_BFM_B64::Inst_SOP2__S_BFM_B64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfm_b64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFM_B64 - - Inst_SOP2__S_BFM_B64::~Inst_SOP2__S_BFM_B64() - { - } // ~Inst_SOP2__S_BFM_B64 - - // --- description from .arch file --- - // D.u64 = ((1ULL << S0.u[5:0]) - 1) << S1.u[5:0] (bitfield mask). - void - Inst_SOP2__S_BFM_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - sdst = ((1ULL << bits(src0.rawData(), 5, 0)) - 1) - << bits(src1.rawData(), 5, 0); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_MUL_I32 class methods --- - - Inst_SOP2__S_MUL_I32::Inst_SOP2__S_MUL_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_mul_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MUL_I32 - - Inst_SOP2__S_MUL_I32::~Inst_SOP2__S_MUL_I32() - { - } // ~Inst_SOP2__S_MUL_I32 - - // --- description from .arch file --- - // D.i = S0.i * S1.i. 
- void - Inst_SOP2__S_MUL_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - sdst = src0.rawData() * src1.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_BFE_U32 class methods --- - - Inst_SOP2__S_BFE_U32::Inst_SOP2__S_BFE_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_U32 - - Inst_SOP2__S_BFE_U32::~Inst_SOP2__S_BFE_U32() - { - } // ~Inst_SOP2__S_BFE_U32 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is - // field width. - // D.u = (S0.u>>S1.u[4:0]) & ((1<> bits(src1.rawData(), 4, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFE_I32 class methods --- - - Inst_SOP2__S_BFE_I32::Inst_SOP2__S_BFE_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_I32 - - Inst_SOP2__S_BFE_I32::~Inst_SOP2__S_BFE_I32() - { - } // ~Inst_SOP2__S_BFE_I32 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is - // field width. - // D.i = (S0.i>>S1.u[4:0]) & ((1<> bits(src1.rawData(), 4, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - - // Above extracted a signed int of size src1[22:16] bits which needs - // to be signed-extended. Check if the MSB of our src1[22:16]-bit - // integer is 1, and sign extend it is. - // - // Note: The description in the Vega ISA manual does not mention to - // sign-extend the result. 
An update description can be found in the - // more recent RDNA3 manual here: - // https://developer.amd.com/wp-content/resources/ - // RDNA3_Shader_ISA_December2022.pdf - if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) { - sdst = sdst.rawData() - | (0xffffffff << bits(src1.rawData(), 22, 16)); - } - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFE_U64 class methods --- - - Inst_SOP2__S_BFE_U64::Inst_SOP2__S_BFE_U64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_u64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_U64 - - Inst_SOP2__S_BFE_U64::~Inst_SOP2__S_BFE_U64() - { - } // ~Inst_SOP2__S_BFE_U64 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is - // field width. - // D.u64 = (S0.u64>>S1.u[5:0]) & ((1<> bits(src1.rawData(), 5, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_BFE_I64 class methods --- - - Inst_SOP2__S_BFE_I64::Inst_SOP2__S_BFE_I64(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_bfe_i64") - { - setFlag(ALU); - } // Inst_SOP2__S_BFE_I64 - - Inst_SOP2__S_BFE_I64::~Inst_SOP2__S_BFE_I64() - { - } // ~Inst_SOP2__S_BFE_I64 - - // --- description from .arch file --- - // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is - // field width. - // D.i64 = (S0.i64>>S1.u[5:0]) & ((1<> bits(src1.rawData(), 5, 0)) - & ((1 << bits(src1.rawData(), 22, 16)) - 1); - - // Above extracted a signed int of size src1[22:16] bits which needs - // to be signed-extended. Check if the MSB of our src1[22:16]-bit - // integer is 1, and sign extend it is. - if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) { - sdst = sdst.rawData() - | 0xffffffffffffffff << bits(src1.rawData(), 22, 16); - } - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_CBRANCH_G_FORK class methods --- - - Inst_SOP2__S_CBRANCH_G_FORK::Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_cbranch_g_fork") - { - setFlag(Branch); - } // Inst_SOP2__S_CBRANCH_G_FORK - - Inst_SOP2__S_CBRANCH_G_FORK::~Inst_SOP2__S_CBRANCH_G_FORK() - { - } // ~Inst_SOP2__S_CBRANCH_G_FORK - - // --- description from .arch file --- - // mask_pass = S0.u64 & EXEC; - // mask_fail = ~S0.u64 & EXEC; - // if(mask_pass == EXEC) - // PC = S1.u64; - // elsif(mask_fail == EXEC) - // PC += 4; - // elsif(bitcount(mask_fail) < bitcount(mask_pass)) - // EXEC = mask_fail; - // SGPR[CSP*4] = { S1.u64, mask_pass }; - // CSP++; - // PC += 4; - // else - // EXEC = mask_pass; - // SGPR[CSP*4] = { PC + 4, mask_fail }; - // CSP++; - // PC = S1.u64; - // end. - // Conditional branch using branch-stack. - // S0 = compare mask(vcc or any sgpr) and - // S1 = 64-bit byte address of target instruction. - // See also S_CBRANCH_JOIN. - void - Inst_SOP2__S_CBRANCH_G_FORK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP2__S_ABSDIFF_I32 class methods --- - - Inst_SOP2__S_ABSDIFF_I32::Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_absdiff_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_ABSDIFF_I32 - - Inst_SOP2__S_ABSDIFF_I32::~Inst_SOP2__S_ABSDIFF_I32() - { - } // ~Inst_SOP2__S_ABSDIFF_I32 - - // --- description from .arch file --- - // D.i = S0.i - S1.i; - // if(D.i < 0) then D.i = -D.i; - // SCC = 1 if result is non-zero. - // Compute the absolute value of difference between two values. 
- void - Inst_SOP2__S_ABSDIFF_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - sdst = std::abs(src0.rawData() - src1.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP2__S_RFE_RESTORE_B64 class methods --- - - Inst_SOP2__S_RFE_RESTORE_B64::Inst_SOP2__S_RFE_RESTORE_B64( - InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_rfe_restore_b64") - { - } // Inst_SOP2__S_RFE_RESTORE_B64 - - Inst_SOP2__S_RFE_RESTORE_B64::~Inst_SOP2__S_RFE_RESTORE_B64() - { - } // ~Inst_SOP2__S_RFE_RESTORE_B64 - - // --- description from .arch file --- - // PRIV = 0; - // PC = S0.u64; - // INST_ATC = S1.u32[0]. - // Return from exception handler and continue, possibly changing the - // --- instruction ATC mode. - // This instruction may only be used within a trap handler. - // Use this instruction when the main program may be in a different memory - // --- space than the trap handler. 
- void - Inst_SOP2__S_RFE_RESTORE_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP2__S_MUL_HI_U32 class methods --- - - Inst_SOP2__S_MUL_HI_U32::Inst_SOP2__S_MUL_HI_U32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_mul_hi_u32") - { - setFlag(ALU); - } // Inst_SOP2__S_MUL_HI_U32 - - Inst_SOP2__S_MUL_HI_U32::~Inst_SOP2__S_MUL_HI_U32() - { - } // ~Inst_SOP2__S_MUL_HI_U32 - - // --- description from .arch file --- - // D.u = (S0.u * S1.u) >> 32; - void - Inst_SOP2__S_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - VecElemU64 tmp_dst = - ((VecElemU64)src0.rawData() * (VecElemU64)src1.rawData()); - sdst = (tmp_dst >> 32); - - sdst.write(); - } // execute - // --- Inst_SOP2__S_MUL_HI_I32 class methods --- - - Inst_SOP2__S_MUL_HI_I32::Inst_SOP2__S_MUL_HI_I32(InFmt_SOP2 *iFmt) - : Inst_SOP2(iFmt, "s_mul_hi_i32") - { - setFlag(ALU); - } // Inst_SOP2__S_MUL_HI_I32 - - Inst_SOP2__S_MUL_HI_I32::~Inst_SOP2__S_MUL_HI_I32() - { - } // ~Inst_SOP2__S_MUL_HI_I32 - - // --- description from .arch file --- - // D.u = (S0.u * S1.u) >> 32; - void - Inst_SOP2__S_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src0.read(); - src1.read(); - - VecElemI64 tmp_src0 = - sext::digits>(src0.rawData()); - VecElemI64 tmp_src1 = - sext::digits>(src1.rawData()); - sdst = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - - sdst.write(); - } // execute - // --- Inst_SOPK__S_MOVK_I32 class methods --- - - Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_movk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_MOVK_I32 - - 
Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32() - { - } // ~Inst_SOPK__S_MOVK_I32 - - // --- description from .arch file --- - // D.i = signext(SIMM16) (sign extension). - void - Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - sdst = simm16; - - sdst.write(); - } // execute - // --- Inst_SOPK__S_CMOVK_I32 class methods --- - - Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmovk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMOVK_I32 - - Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32() - { - } // ~Inst_SOPK__S_CMOVK_I32 - - // --- description from .arch file --- - // if(SCC) then D.i = signext(SIMM16); - // else NOP. - // Conditional move with sign extension. - void - Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (scc.rawData()) { - sdst = simm16; - sdst.write(); - } - } // execute - // --- Inst_SOPK__S_CMPK_EQ_I32 class methods --- - - Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_eq_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_EQ_I32 - - Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32() - { - } // ~Inst_SOPK__S_CMPK_EQ_I32 - - // --- description from .arch file --- - // SCC = (S0.i == signext(SIMM16)). - void - Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() == simm16) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LG_I32 class methods --- - - Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lg_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LG_I32 - - Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32() - { - } // ~Inst_SOPK__S_CMPK_LG_I32 - - // --- description from .arch file --- - // SCC = (S0.i != signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() != simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GT_I32 class methods --- - - Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_gt_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GT_I32 - - Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32() - { - } // ~Inst_SOPK__S_CMPK_GT_I32 - - // --- description from .arch file --- - // SCC = (S0.i > signext(SIMM16)). - void - Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() > simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GE_I32 class methods --- - - Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_ge_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GE_I32 - - Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32() - { - } // ~Inst_SOPK__S_CMPK_GE_I32 - - // --- description from .arch file --- - // SCC = (S0.i >= signext(SIMM16)). 
- void - Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() >= simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LT_I32 class methods --- - - Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lt_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LT_I32 - - Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32() - { - } // ~Inst_SOPK__S_CMPK_LT_I32 - - // --- description from .arch file --- - // SCC = (S0.i < signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() < simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LE_I32 class methods --- - - Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_le_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LE_I32 - - Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32() - { - } // ~Inst_SOPK__S_CMPK_LE_I32 - - // --- description from .arch file --- - // SCC = (S0.i <= signext(SIMM16)). - void - Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() <= simm16) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_EQ_U32 class methods --- - - Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_eq_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_EQ_U32 - - Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32() - { - } // ~Inst_SOPK__S_CMPK_EQ_U32 - - // --- description from .arch file --- - // SCC = (S0.u == SIMM16). - void - Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() == simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LG_U32 class methods --- - - Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lg_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LG_U32 - - Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32() - { - } // ~Inst_SOPK__S_CMPK_LG_U32 - - // --- description from .arch file --- - // SCC = (S0.u != SIMM16). - void - Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() != simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GT_U32 class methods --- - - Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_gt_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GT_U32 - - Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32() - { - } // ~Inst_SOPK__S_CMPK_GT_U32 - - // --- description from .arch file --- - // SCC = (S0.u > SIMM16). 
- void - Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() > simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_GE_U32 class methods --- - - Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_ge_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_GE_U32 - - Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32() - { - } // ~Inst_SOPK__S_CMPK_GE_U32 - - // --- description from .arch file --- - // SCC = (S0.u >= SIMM16). - void - Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() >= simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LT_U32 class methods --- - - Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_lt_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LT_U32 - - Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32() - { - } // ~Inst_SOPK__S_CMPK_LT_U32 - - // --- description from .arch file --- - // SCC = (S0.u < SIMM16). - void - Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() < simm16) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_CMPK_LE_U32 class methods --- - - Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cmpk_le_u32") - { - setFlag(ALU); - } // Inst_SOPK__S_CMPK_LE_U32 - - Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32() - { - } // ~Inst_SOPK__S_CMPK_LE_U32 - - // --- description from .arch file --- - // SCC = (S0.u <= SIMM16). - void - Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16; - ConstScalarOperandU32 src(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - scc = (src.rawData() <= simm16) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPK__S_ADDK_I32 class methods --- - - Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_addk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_ADDK_I32 - - Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32() - { - } // ~Inst_SOPK__S_ADDK_I32 - - // --- description from .arch file --- - // D.i = D.i + signext(SIMM16); - // SCC = overflow. - void - Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = src.rawData() + (ScalarRegI32)sext<16>(simm16); - scc = (bits(src.rawData(), 31) == bits(simm16, 15) - && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOPK__S_MULK_I32 class methods --- - - Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_mulk_i32") - { - setFlag(ALU); - } // Inst_SOPK__S_MULK_I32 - - Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32() - { - } // ~Inst_SOPK__S_MULK_I32 - - // --- description from .arch file --- - // D.i = D.i * signext(SIMM16). 
- void - Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandI32 src(gpuDynInst, instData.SDST); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData() * (ScalarRegI32)sext<16>(simm16); - - sdst.write(); - } // execute - // --- Inst_SOPK__S_CBRANCH_I_FORK class methods --- - - Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_cbranch_i_fork") - { - setFlag(Branch); - } // Inst_SOPK__S_CBRANCH_I_FORK - - Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK() - { - } // ~Inst_SOPK__S_CBRANCH_I_FORK - - // --- description from .arch file --- - // mask_pass = S0.u64 & EXEC; - // mask_fail = ~S0.u64 & EXEC; - // target_addr = PC + signext(SIMM16 * 4) + 4; - // if(mask_pass == EXEC) - // PC = target_addr; - // elsif(mask_fail == EXEC) - // PC += 4; - // elsif(bitcount(mask_fail) < bitcount(mask_pass)) - // EXEC = mask_fail; - // SGPR[CSP*4] = { target_addr, mask_pass }; - // CSP++; - // PC += 4; - // else - // EXEC = mask_pass; - // SGPR[CSP*4] = { PC + 4, mask_fail }; - // CSP++; - // PC = target_addr; - // end. - // Conditional branch using branch-stack. - // S0 = compare mask(vcc or any sgpr), and - // SIMM16 = signed DWORD branch offset relative to next instruction. - // See also S_CBRANCH_JOIN. - void - Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPK__S_GETREG_B32 class methods --- - - Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_getreg_b32") - { - setFlag(ALU); - } // Inst_SOPK__S_GETREG_B32 - - Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32() - { - } // ~Inst_SOPK__S_GETREG_B32 - - // --- description from .arch file --- - // D.u = hardware-reg. Read some or all of a hardware register into the - // LSBs of D. 
- // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size - // is 1..32. - void - Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ScalarRegU32 hwregId = simm16 & 0x3f; - ScalarRegU32 offset = (simm16 >> 6) & 31; - ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; - - ScalarRegU32 hwreg = - gpuDynInst->computeUnit()->shader->getHwReg(hwregId); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - sdst.read(); - - // Store value from hardware to part of the SDST. - ScalarRegU32 mask = (((1U << size) - 1U) << offset); - sdst = (hwreg & mask) >> offset; - sdst.write(); - } // execute - // --- Inst_SOPK__S_SETREG_B32 class methods --- - - Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_setreg_b32") - { - setFlag(ALU); - } // Inst_SOPK__S_SETREG_B32 - - Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32() - { - } // ~Inst_SOPK__S_SETREG_B32 - - // --- description from .arch file --- - // hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware - // register. - // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size - // is 1..32. - void - Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ScalarRegU32 hwregId = simm16 & 0x3f; - ScalarRegU32 offset = (simm16 >> 6) & 31; - ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; - - ScalarRegU32 hwreg = - gpuDynInst->computeUnit()->shader->getHwReg(hwregId); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - sdst.read(); - - // Store value from SDST to part of the hardware register. 
- ScalarRegU32 mask = (((1U << size) - 1U) << offset); - hwreg = ((hwreg & ~mask) | ((sdst.rawData() << offset) & mask)); - gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); - - // set MODE register to control the behavior of single precision - // floating-point numbers: denormal mode or round mode - if (hwregId==1 && size==2 - && (offset==4 || offset==0)) { - warn_once("Be cautious that s_setreg_b32 has no real effect " - "on FP modes: %s\n", gpuDynInst->disassemble()); - return; - } - - // panic if not changing MODE of floating-point numbers - panicUnimplemented(); - } // execute - // --- Inst_SOPK__S_SETREG_IMM32_B32 class methods --- - - Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32( - InFmt_SOPK *iFmt) - : Inst_SOPK(iFmt, "s_setreg_imm32_b32") - { - setFlag(ALU); - } // Inst_SOPK__S_SETREG_IMM32_B32 - - Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32() - { - } // ~Inst_SOPK__S_SETREG_IMM32_B32 - - // --- description from .arch file --- - // Write some or all of the LSBs of IMM32 into a hardware register; this - // --- instruction requires a 32-bit literal constant. - // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size - // is 1..32. - void - Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ScalarRegI16 simm16 = instData.SIMM16; - ScalarRegU32 hwregId = simm16 & 0x3f; - ScalarRegU32 offset = (simm16 >> 6) & 31; - ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; - - ScalarRegU32 hwreg = - gpuDynInst->computeUnit()->shader->getHwReg(hwregId); - ScalarRegI32 simm32 = extData.imm_u32; - - // Store value from SIMM32 to part of the hardware register. 
- ScalarRegU32 mask = (((1U << size) - 1U) << offset); - hwreg = ((hwreg & ~mask) | ((simm32 << offset) & mask)); - gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); - - // set MODE register to control the behavior of single precision - // floating-point numbers: denormal mode or round mode - if (hwregId==HW_REG_MODE && size==2 - && (offset==4 || offset==0)) { - warn_once("Be cautious that s_setreg_imm32_b32 has no real effect " - "on FP modes: %s\n", gpuDynInst->disassemble()); - return; - } - - // panic if not changing modes of single-precision FPs - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_MOV_B32 class methods --- - - Inst_SOP1__S_MOV_B32::Inst_SOP1__S_MOV_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_B32 - - Inst_SOP1__S_MOV_B32::~Inst_SOP1__S_MOV_B32() - { - } // ~Inst_SOP1__S_MOV_B32 - - // --- description from .arch file --- - // D.u = S0.u. - void - Inst_SOP1__S_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOV_B64 class methods --- - - Inst_SOP1__S_MOV_B64::Inst_SOP1__S_MOV_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_B64 - - Inst_SOP1__S_MOV_B64::~Inst_SOP1__S_MOV_B64() - { - } // ~Inst_SOP1__S_MOV_B64 - - // --- description from .arch file --- - // D.u64 = S0.u64. 
- void - Inst_SOP1__S_MOV_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_CMOV_B32 class methods --- - - Inst_SOP1__S_CMOV_B32::Inst_SOP1__S_CMOV_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_cmov_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_CMOV_B32 - - Inst_SOP1__S_CMOV_B32::~Inst_SOP1__S_CMOV_B32() - { - } // ~Inst_SOP1__S_CMOV_B32 - - // --- description from .arch file --- - // (SCC) then D.u = S0.u; - // else NOP. - // Conditional move. - void - Inst_SOP1__S_CMOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - scc.read(); - - if (scc.rawData()) { - sdst = src.rawData(); - sdst.write(); - } - } // execute - // --- Inst_SOP1__S_CMOV_B64 class methods --- - - Inst_SOP1__S_CMOV_B64::Inst_SOP1__S_CMOV_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_cmov_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_CMOV_B64 - - Inst_SOP1__S_CMOV_B64::~Inst_SOP1__S_CMOV_B64() - { - } // ~Inst_SOP1__S_CMOV_B64 - - // --- description from .arch file --- - // if(SCC) then D.u64 = S0.u64; - // else NOP. - // Conditional move. 
- void - Inst_SOP1__S_CMOV_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - scc.read(); - - if (scc.rawData()) { - sdst = src.rawData(); - sdst.write(); - } - } // execute - // --- Inst_SOP1__S_NOT_B32 class methods --- - - Inst_SOP1__S_NOT_B32::Inst_SOP1__S_NOT_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_not_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_NOT_B32 - - Inst_SOP1__S_NOT_B32::~Inst_SOP1__S_NOT_B32() - { - } // ~Inst_SOP1__S_NOT_B32 - - // --- description from .arch file --- - // D.u = ~S0.u; - // SCC = 1 if result is non-zero. - // Bitwise negation. - void - Inst_SOP1__S_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = ~src.rawData(); - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_NOT_B64 class methods --- - - Inst_SOP1__S_NOT_B64::Inst_SOP1__S_NOT_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_not_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_NOT_B64 - - Inst_SOP1__S_NOT_B64::~Inst_SOP1__S_NOT_B64() - { - } // ~Inst_SOP1__S_NOT_B64 - - // --- description from .arch file --- - // D.u64 = ~S0.u64; - // SCC = 1 if result is non-zero. - // Bitwise negation. - void - Inst_SOP1__S_NOT_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = ~src.rawData(); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_WQM_B32 class methods --- - - Inst_SOP1__S_WQM_B32::Inst_SOP1__S_WQM_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_wqm_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_WQM_B32 - - Inst_SOP1__S_WQM_B32::~Inst_SOP1__S_WQM_B32() - { - } // ~Inst_SOP1__S_WQM_B32 - - // --- description from .arch file --- - // D[i] = (S0[(i & ~3):(i | 3)] != 0); - // Computes whole quad mode for an active/valid mask. - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_WQM_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wholeQuadMode(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_WQM_B64 class methods --- - - Inst_SOP1__S_WQM_B64::Inst_SOP1__S_WQM_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_wqm_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_WQM_B64 - - Inst_SOP1__S_WQM_B64::~Inst_SOP1__S_WQM_B64() - { - } // ~Inst_SOP1__S_WQM_B64 - - // --- description from .arch file --- - // D[i] = (S0[(i & ~3):(i | 3)] != 0); - // Computes whole quad mode for an active/valid mask. - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_WQM_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wholeQuadMode(src.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BREV_B32 class methods --- - - Inst_SOP1__S_BREV_B32::Inst_SOP1__S_BREV_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_brev_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BREV_B32 - - Inst_SOP1__S_BREV_B32::~Inst_SOP1__S_BREV_B32() - { - } // ~Inst_SOP1__S_BREV_B32 - - // --- description from .arch file --- - // D.u[31:0] = S0.u[0:31] (reverse bits). - void - Inst_SOP1__S_BREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = reverseBits(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BREV_B64 class methods --- - - Inst_SOP1__S_BREV_B64::Inst_SOP1__S_BREV_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_brev_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BREV_B64 - - Inst_SOP1__S_BREV_B64::~Inst_SOP1__S_BREV_B64() - { - } // ~Inst_SOP1__S_BREV_B64 - - // --- description from .arch file --- - // D.u64[63:0] = S0.u64[0:63] (reverse bits). - void - Inst_SOP1__S_BREV_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = reverseBits(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BCNT0_I32_B32 class methods --- - - Inst_SOP1__S_BCNT0_I32_B32::Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt0_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT0_I32_B32 - - Inst_SOP1__S_BCNT0_I32_B32::~Inst_SOP1__S_BCNT0_I32_B32() - { - } // ~Inst_SOP1__S_BCNT0_I32_B32 - - // --- description from .arch file --- - // D.i = CountZeroBits(S0.u); - // SCC = 1 if result is non-zero. 
- void - Inst_SOP1__S_BCNT0_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = countZeroBits(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BCNT0_I32_B64 class methods --- - - Inst_SOP1__S_BCNT0_I32_B64::Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt0_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT0_I32_B64 - - Inst_SOP1__S_BCNT0_I32_B64::~Inst_SOP1__S_BCNT0_I32_B64() - { - } // ~Inst_SOP1__S_BCNT0_I32_B64 - - // --- description from .arch file --- - // D.i = CountZeroBits(S0.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT0_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = countZeroBits(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BCNT1_I32_B32 class methods --- - - Inst_SOP1__S_BCNT1_I32_B32::Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt1_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT1_I32_B32 - - Inst_SOP1__S_BCNT1_I32_B32::~Inst_SOP1__S_BCNT1_I32_B32() - { - } // ~Inst_SOP1__S_BCNT1_I32_B32 - - // --- description from .arch file --- - // D.i = CountOneBits(S0.u); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT1_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = popCount(src.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_BCNT1_I32_B64 class methods --- - - Inst_SOP1__S_BCNT1_I32_B64::Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bcnt1_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BCNT1_I32_B64 - - Inst_SOP1__S_BCNT1_I32_B64::~Inst_SOP1__S_BCNT1_I32_B64() - { - } // ~Inst_SOP1__S_BCNT1_I32_B64 - - // --- description from .arch file --- - // D.i = CountOneBits(S0.u64); - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_BCNT1_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = popCount(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_FF0_I32_B32 class methods --- - - Inst_SOP1__S_FF0_I32_B32::Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff0_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_FF0_I32_B32 - - Inst_SOP1__S_FF0_I32_B32::~Inst_SOP1__S_FF0_I32_B32() - { - } // ~Inst_SOP1__S_FF0_I32_B32 - - // --- description from .arch file --- - // D.i = FindFirstZero(S0.u); - // If no zeros are found, return -1. - // Returns the bit position of the first zero from the LSB. 
- void - Inst_SOP1__S_FF0_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstZero(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FF0_I32_B64 class methods --- - - Inst_SOP1__S_FF0_I32_B64::Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff0_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_FF0_I32_B64 - - Inst_SOP1__S_FF0_I32_B64::~Inst_SOP1__S_FF0_I32_B64() - { - } // ~Inst_SOP1__S_FF0_I32_B64 - - // --- description from .arch file --- - // D.i = FindFirstZero(S0.u64); - // If no zeros are found, return -1. - // Returns the bit position of the first zero from the LSB. - void - Inst_SOP1__S_FF0_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstZero(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FF1_I32_B32 class methods --- - - Inst_SOP1__S_FF1_I32_B32::Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff1_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_FF1_I32_B32 - - Inst_SOP1__S_FF1_I32_B32::~Inst_SOP1__S_FF1_I32_B32() - { - } // ~Inst_SOP1__S_FF1_I32_B32 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u); - // If no ones are found, return -1. - // Returns the bit position of the first one from the LSB. 
- void - Inst_SOP1__S_FF1_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstOne(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FF1_I32_B64 class methods --- - - Inst_SOP1__S_FF1_I32_B64::Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_ff1_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_FF1_I32_B64 - - Inst_SOP1__S_FF1_I32_B64::~Inst_SOP1__S_FF1_I32_B64() - { - } // ~Inst_SOP1__S_FF1_I32_B64 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u64); - // If no ones are found, return -1. - // Returns the bit position of the first one from the LSB. - void - Inst_SOP1__S_FF1_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = findFirstOne(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32_B32 class methods --- - - Inst_SOP1__S_FLBIT_I32_B32::Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_B32 - - Inst_SOP1__S_FLBIT_I32_B32::~Inst_SOP1__S_FLBIT_I32_B32() - { - } // ~Inst_SOP1__S_FLBIT_I32_B32 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u); - // If no ones are found, return -1. - // Counts how many zeros before the first one starting from the MSB. 
- void - Inst_SOP1__S_FLBIT_I32_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = countZeroBitsMsb(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32_B64 class methods --- - - Inst_SOP1__S_FLBIT_I32_B64::Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_B64 - - Inst_SOP1__S_FLBIT_I32_B64::~Inst_SOP1__S_FLBIT_I32_B64() - { - } // ~Inst_SOP1__S_FLBIT_I32_B64 - - // --- description from .arch file --- - // D.i = FindFirstOne(S0.u64); - // If no ones are found, return -1. - // Counts how many zeros before the first one starting from the MSB. - void - Inst_SOP1__S_FLBIT_I32_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = countZeroBitsMsb(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32 class methods --- - - Inst_SOP1__S_FLBIT_I32::Inst_SOP1__S_FLBIT_I32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32 - - Inst_SOP1__S_FLBIT_I32::~Inst_SOP1__S_FLBIT_I32() - { - } // ~Inst_SOP1__S_FLBIT_I32 - - // --- description from .arch file --- - // D.i = FirstOppositeSignBit(S0.i); - // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. - // Counts how many bits in a row (from MSB to LSB) are the same as the - // sign bit. 
- void - Inst_SOP1__S_FLBIT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = firstOppositeSignBit(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_FLBIT_I32_I64 class methods --- - - Inst_SOP1__S_FLBIT_I32_I64::Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_flbit_i32_i64") - { - setFlag(ALU); - } // Inst_SOP1__S_FLBIT_I32_I64 - - Inst_SOP1__S_FLBIT_I32_I64::~Inst_SOP1__S_FLBIT_I32_I64() - { - } // ~Inst_SOP1__S_FLBIT_I32_I64 - - // --- description from .arch file --- - // D.i = FirstOppositeSignBit(S0.i64); - // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. - // Counts how many bits in a row (from MSB to LSB) are the same as the - // sign bit. - void - Inst_SOP1__S_FLBIT_I32_I64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI64 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = firstOppositeSignBit(src.rawData()); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_SEXT_I32_I8 class methods --- - - Inst_SOP1__S_SEXT_I32_I8::Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_sext_i32_i8") - { - setFlag(ALU); - } // Inst_SOP1__S_SEXT_I32_I8 - - Inst_SOP1__S_SEXT_I32_I8::~Inst_SOP1__S_SEXT_I32_I8() - { - } // ~Inst_SOP1__S_SEXT_I32_I8 - - // --- description from .arch file --- - // D.i = signext(S0.i[7:0]) (sign extension). 
- void - Inst_SOP1__S_SEXT_I32_I8::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = sext::digits>( - bits(src.rawData(), 7, 0)); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_SEXT_I32_I16 class methods --- - - Inst_SOP1__S_SEXT_I32_I16::Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_sext_i32_i16") - { - setFlag(ALU); - } // Inst_SOP1__S_SEXT_I32_I16 - - Inst_SOP1__S_SEXT_I32_I16::~Inst_SOP1__S_SEXT_I32_I16() - { - } // ~Inst_SOP1__S_SEXT_I32_I16 - - // --- description from .arch file --- - // D.i = signext(S0.i[15:0]) (sign extension). - void - Inst_SOP1__S_SEXT_I32_I16::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = sext::digits>( - bits(src.rawData(), 15, 0)); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BITSET0_B32 class methods --- - - Inst_SOP1__S_BITSET0_B32::Inst_SOP1__S_BITSET0_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset0_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET0_B32 - - Inst_SOP1__S_BITSET0_B32::~Inst_SOP1__S_BITSET0_B32() - { - } // ~Inst_SOP1__S_BITSET0_B32 - - // --- description from .arch file --- - // D.u[S0.u[4:0]] = 0. 
- void - Inst_SOP1__S_BITSET0_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 4, 0), 0); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BITSET0_B64 class methods --- - - Inst_SOP1__S_BITSET0_B64::Inst_SOP1__S_BITSET0_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset0_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET0_B64 - - Inst_SOP1__S_BITSET0_B64::~Inst_SOP1__S_BITSET0_B64() - { - } // ~Inst_SOP1__S_BITSET0_B64 - - // --- description from .arch file --- - // D.u64[S0.u[5:0]] = 0. - void - Inst_SOP1__S_BITSET0_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 5, 0), 0); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BITSET1_B32 class methods --- - - Inst_SOP1__S_BITSET1_B32::Inst_SOP1__S_BITSET1_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset1_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET1_B32 - - Inst_SOP1__S_BITSET1_B32::~Inst_SOP1__S_BITSET1_B32() - { - } // ~Inst_SOP1__S_BITSET1_B32 - - // --- description from .arch file --- - // D.u[S0.u[4:0]] = 1. - void - Inst_SOP1__S_BITSET1_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 4, 0), 1); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_BITSET1_B64 class methods --- - - Inst_SOP1__S_BITSET1_B64::Inst_SOP1__S_BITSET1_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_bitset1_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_BITSET1_B64 - - Inst_SOP1__S_BITSET1_B64::~Inst_SOP1__S_BITSET1_B64() - { - } // ~Inst_SOP1__S_BITSET1_B64 - - // --- description from .arch file --- - // D.u64[S0.u[5:0]] = 1. 
- void - Inst_SOP1__S_BITSET1_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst.setBit(bits(src.rawData(), 5, 0), 1); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_GETPC_B64 class methods --- - - Inst_SOP1__S_GETPC_B64::Inst_SOP1__S_GETPC_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_getpc_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_GETPC_B64 - - Inst_SOP1__S_GETPC_B64::~Inst_SOP1__S_GETPC_B64() - { - } // ~Inst_SOP1__S_GETPC_B64 - - // --- description from .arch file --- - // D.u64 = PC + 4. - // Destination receives the byte address of the next instruction. - void - Inst_SOP1__S_GETPC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Addr pc = gpuDynInst->pc(); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - sdst = pc + 4; - - sdst.write(); - } // execute - // --- Inst_SOP1__S_SETPC_B64 class methods --- - - Inst_SOP1__S_SETPC_B64::Inst_SOP1__S_SETPC_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_setpc_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_SETPC_B64 - - Inst_SOP1__S_SETPC_B64::~Inst_SOP1__S_SETPC_B64() - { - } // ~Inst_SOP1__S_SETPC_B64 - - // --- description from .arch file --- - // PC = S0.u64. - // S0.u64 is a byte address of the instruction to jump to. - void - Inst_SOP1__S_SETPC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - - src.read(); - - wf->pc(src.rawData()); - } // execute - // --- Inst_SOP1__S_SWAPPC_B64 class methods --- - - Inst_SOP1__S_SWAPPC_B64::Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_swappc_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_SWAPPC_B64 - - Inst_SOP1__S_SWAPPC_B64::~Inst_SOP1__S_SWAPPC_B64() - { - } // ~Inst_SOP1__S_SWAPPC_B64 - - // --- description from .arch file --- - // D.u64 = PC + 4; PC = S0.u64. - // S0.u64 is a byte address of the instruction to jump to. 
- void - Inst_SOP1__S_SWAPPC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = pc + 4; - - wf->pc(src.rawData()); - sdst.write(); - } // execute - // --- Inst_SOP1__S_RFE_B64 class methods --- - - Inst_SOP1__S_RFE_B64::Inst_SOP1__S_RFE_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_rfe_b64") - { - } // Inst_SOP1__S_RFE_B64 - - Inst_SOP1__S_RFE_B64::~Inst_SOP1__S_RFE_B64() - { - } // ~Inst_SOP1__S_RFE_B64 - - // --- description from .arch file --- - // PRIV = 0; - // PC = S0.u64. - // Return from exception handler and continue. - // This instruction may only be used within a trap handler. - void - Inst_SOP1__S_RFE_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_AND_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_AND_SAVEEXEC_B64::Inst_SOP1__S_AND_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_and_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_AND_SAVEEXEC_B64 - - Inst_SOP1__S_AND_SAVEEXEC_B64::~Inst_SOP1__S_AND_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_AND_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 & EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_AND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() & wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_OR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_OR_SAVEEXEC_B64::Inst_SOP1__S_OR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_or_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_OR_SAVEEXEC_B64 - - Inst_SOP1__S_OR_SAVEEXEC_B64::~Inst_SOP1__S_OR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_OR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 | EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_OR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() | wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_XOR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_XOR_SAVEEXEC_B64::Inst_SOP1__S_XOR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_xor_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_XOR_SAVEEXEC_B64 - - Inst_SOP1__S_XOR_SAVEEXEC_B64::~Inst_SOP1__S_XOR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_XOR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 ^ EXEC; - // SCC = 1 if the new value of EXEC is non-zero. 
- void - Inst_SOP1__S_XOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() ^ wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_ANDN2_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_ANDN2_SAVEEXEC_B64::Inst_SOP1__S_ANDN2_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_andn2_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_ANDN2_SAVEEXEC_B64 - - Inst_SOP1__S_ANDN2_SAVEEXEC_B64::~Inst_SOP1__S_ANDN2_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_ANDN2_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 & ~EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_ANDN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() &~ wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_ORN2_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_ORN2_SAVEEXEC_B64::Inst_SOP1__S_ORN2_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_orn2_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_ORN2_SAVEEXEC_B64 - - Inst_SOP1__S_ORN2_SAVEEXEC_B64::~Inst_SOP1__S_ORN2_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_ORN2_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = S0.u64 | ~EXEC; - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_ORN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = src.rawData() |~ wf->execMask().to_ullong(); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_NAND_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_NAND_SAVEEXEC_B64::Inst_SOP1__S_NAND_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_nand_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_NAND_SAVEEXEC_B64 - - Inst_SOP1__S_NAND_SAVEEXEC_B64::~Inst_SOP1__S_NAND_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_NAND_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = ~(S0.u64 & EXEC); - // SCC = 1 if the new value of EXEC is non-zero. 
- void - Inst_SOP1__S_NAND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = ~(src.rawData() & wf->execMask().to_ullong()); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_NOR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_NOR_SAVEEXEC_B64::Inst_SOP1__S_NOR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_nor_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_NOR_SAVEEXEC_B64 - - Inst_SOP1__S_NOR_SAVEEXEC_B64::~Inst_SOP1__S_NOR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_NOR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = ~(S0.u64 | EXEC); - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_NOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = ~(src.rawData() | wf->execMask().to_ullong()); - scc = wf->execMask().any() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_XNOR_SAVEEXEC_B64 class methods --- - - Inst_SOP1__S_XNOR_SAVEEXEC_B64::Inst_SOP1__S_XNOR_SAVEEXEC_B64( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_xnor_saveexec_b64") - { - setFlag(ALU); - setFlag(ReadsEXEC); - setFlag(WritesEXEC); - } // Inst_SOP1__S_XNOR_SAVEEXEC_B64 - - Inst_SOP1__S_XNOR_SAVEEXEC_B64::~Inst_SOP1__S_XNOR_SAVEEXEC_B64() - { - } // ~Inst_SOP1__S_XNOR_SAVEEXEC_B64 - - // --- description from .arch file --- - // D.u64 = EXEC; - // EXEC = ~(S0.u64 ^ EXEC); - // SCC = 1 if the new value of EXEC is non-zero. - void - Inst_SOP1__S_XNOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = wf->execMask().to_ullong(); - wf->execMask() = ~(src.rawData() ^ wf->execMask().to_ullong()); - scc = wf->execMask().any() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_QUADMASK_B32 class methods --- - - Inst_SOP1__S_QUADMASK_B32::Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_quadmask_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_QUADMASK_B32 - - Inst_SOP1__S_QUADMASK_B32::~Inst_SOP1__S_QUADMASK_B32() - { - } // ~Inst_SOP1__S_QUADMASK_B32 - - // --- description from .arch file --- - // D.u = QuadMask(S0.u): - // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[31:8] = 0; - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_QUADMASK_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = quadMask(src.rawData()); - scc = sdst.rawData() ? 
1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_QUADMASK_B64 class methods --- - - Inst_SOP1__S_QUADMASK_B64::Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_quadmask_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_QUADMASK_B64 - - Inst_SOP1__S_QUADMASK_B64::~Inst_SOP1__S_QUADMASK_B64() - { - } // ~Inst_SOP1__S_QUADMASK_B64 - - // --- description from .arch file --- - // D.u64 = QuadMask(S0.u64): - // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[63:16] = 0; - // SCC = 1 if result is non-zero. - void - Inst_SOP1__S_QUADMASK_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = quadMask(src.rawData()); - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_MOVRELS_B32 class methods --- - - Inst_SOP1__S_MOVRELS_B32::Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movrels_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELS_B32 - - Inst_SOP1__S_MOVRELS_B32::~Inst_SOP1__S_MOVRELS_B32() - { - } // ~Inst_SOP1__S_MOVRELS_B32 - - // --- description from .arch file --- - // D.u = SGPR[S0.u + M0.u].u (move from relative source). 
- void - Inst_SOP1__S_MOVRELS_B32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0 + m0.rawData()); - ScalarOperandU32 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOVRELS_B64 class methods --- - - Inst_SOP1__S_MOVRELS_B64::Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movrels_b64") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELS_B64 - - Inst_SOP1__S_MOVRELS_B64::~Inst_SOP1__S_MOVRELS_B64() - { - } // ~Inst_SOP1__S_MOVRELS_B64 - - // --- description from .arch file --- - // D.u64 = SGPR[S0.u + M0.u].u64 (move from relative source). - // The index in M0.u must be even for this operation. - void - Inst_SOP1__S_MOVRELS_B64::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 m0(gpuDynInst, REG_M0); - m0.read(); - ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0 + m0.rawData()); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src.read(); - - sdst = src.rawData(); - - sdst.write(); - } // execute - // --- Inst_SOP1__S_MOVRELD_B32 class methods --- - - Inst_SOP1__S_MOVRELD_B32::Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_movreld_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOVRELD_B32 - - Inst_SOP1__S_MOVRELD_B32::~Inst_SOP1__S_MOVRELD_B32() - { - } // ~Inst_SOP1__S_MOVRELD_B32 - - // --- description from .arch file --- - // SGPR[D.u + M0.u].u = S0.u (move to relative destination). 
    void
    Inst_SOP1__S_MOVRELD_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Relative-destination move: here M0 offsets the DESTINATION index
        // (SDST + M0), unlike s_movrels which offsets the source.
        ConstScalarOperandU32 m0(gpuDynInst, REG_M0);
        m0.read();
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST + m0.rawData());

        src.read();

        sdst = src.rawData();

        sdst.write();
    } // execute
    // --- Inst_SOP1__S_MOVRELD_B64 class methods ---

    Inst_SOP1__S_MOVRELD_B64::Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_movreld_b64")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_MOVRELD_B64

    Inst_SOP1__S_MOVRELD_B64::~Inst_SOP1__S_MOVRELD_B64()
    {
    } // ~Inst_SOP1__S_MOVRELD_B64

    // --- description from .arch file ---
    // SGPR[D.u + M0.u].u64 = S0.u64 (move to relative destination).
    // The index in M0.u must be even for this operation.
    void
    Inst_SOP1__S_MOVRELD_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // 64-bit relative-destination move.  NOTE(review): the even-index
        // requirement on M0 is not enforced here; confirm whether the
        // operand classes assert it.
        ConstScalarOperandU32 m0(gpuDynInst, REG_M0);
        m0.read();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST + m0.rawData());

        src.read();

        sdst = src.rawData();

        sdst.write();
    } // execute
    // --- Inst_SOP1__S_CBRANCH_JOIN class methods ---

    Inst_SOP1__S_CBRANCH_JOIN::Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_cbranch_join")
    {
        setFlag(Branch);
        setFlag(WritesEXEC);
    } // Inst_SOP1__S_CBRANCH_JOIN

    Inst_SOP1__S_CBRANCH_JOIN::~Inst_SOP1__S_CBRANCH_JOIN()
    {
    } // ~Inst_SOP1__S_CBRANCH_JOIN

    // --- description from .arch file ---
    // saved_csp = S0.u;
    // if(CSP == saved_csp) then
    // PC += 4; // Second time to JOIN: continue with program.
    // else
    // CSP -= 1; // First time to JOIN; jump to other FORK path.
    // {PC, EXEC} = SGPR[CSP * 4]; // Read 128 bits from 4 consecutive
    // SGPRs.
    // end
    // Conditional branch join point (end of conditional branch block). S0 is
    // saved CSP value.
    // See S_CBRANCH_G_FORK and S_CBRANCH_I_FORK for related instructions.
- void - Inst_SOP1__S_CBRANCH_JOIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_ABS_I32 class methods --- - - Inst_SOP1__S_ABS_I32::Inst_SOP1__S_ABS_I32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_abs_i32") - { - setFlag(ALU); - } // Inst_SOP1__S_ABS_I32 - - Inst_SOP1__S_ABS_I32::~Inst_SOP1__S_ABS_I32() - { - } // ~Inst_SOP1__S_ABS_I32 - - // --- description from .arch file --- - // if(S.i < 0) then D.i = -S.i; - // else D.i = S.i; - // SCC = 1 if result is non-zero. - // Integer absolute value. - void - Inst_SOP1__S_ABS_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); - ScalarOperandI32 sdst(gpuDynInst, instData.SDST); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src.read(); - - sdst = std::abs(src.rawData()); - - scc = sdst.rawData() ? 1 : 0; - - sdst.write(); - scc.write(); - } // execute - // --- Inst_SOP1__S_MOV_FED_B32 class methods --- - - Inst_SOP1__S_MOV_FED_B32::Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_mov_fed_b32") - { - setFlag(ALU); - } // Inst_SOP1__S_MOV_FED_B32 - - Inst_SOP1__S_MOV_FED_B32::~Inst_SOP1__S_MOV_FED_B32() - { - } // ~Inst_SOP1__S_MOV_FED_B32 - - // --- description from .arch file --- - // D.u = S0.u. Introduce an EDC double-detect error on write to the - // destination SGPR. - void - Inst_SOP1__S_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOP1__S_SET_GPR_IDX_IDX class methods --- - - Inst_SOP1__S_SET_GPR_IDX_IDX::Inst_SOP1__S_SET_GPR_IDX_IDX( - InFmt_SOP1 *iFmt) - : Inst_SOP1(iFmt, "s_set_gpr_idx_idx") - { - } // Inst_SOP1__S_SET_GPR_IDX_IDX - - Inst_SOP1__S_SET_GPR_IDX_IDX::~Inst_SOP1__S_SET_GPR_IDX_IDX() - { - } // ~Inst_SOP1__S_SET_GPR_IDX_IDX - - // --- description from .arch file --- - // M0[7:0] = S0.u[7:0]. - // Modify the index used in vector GPR indexing. 
- void - Inst_SOP1__S_SET_GPR_IDX_IDX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_SOPC__S_CMP_EQ_I32 class methods --- - - Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_I32 - - Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32() - { - } // ~Inst_SOPC__S_CMP_EQ_I32 - - // --- description from .arch file --- - // SCC = (S0.i == S1.i). - void - Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LG_I32 class methods --- - - Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_I32 - - Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32() - { - } // ~Inst_SOPC__S_CMP_LG_I32 - - // --- description from .arch file --- - // SCC = (S0.i != S1.i). - void - Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GT_I32 class methods --- - - Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_gt_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GT_I32 - - Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32() - { - } // ~Inst_SOPC__S_CMP_GT_I32 - - // --- description from .arch file --- - // SCC = (S0.i > S1.i). 
- void - Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GE_I32 class methods --- - - Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_ge_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GE_I32 - - Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32() - { - } // ~Inst_SOPC__S_CMP_GE_I32 - - // --- description from .arch file --- - // SCC = (S0.i >= S1.i). - void - Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LT_I32 class methods --- - - Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lt_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LT_I32 - - Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32() - { - } // ~Inst_SOPC__S_CMP_LT_I32 - - // --- description from .arch file --- - // SCC = (S0.i < S1.i). - void - Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LE_I32 class methods --- - - Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_le_i32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LE_I32 - - Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32() - { - } // ~Inst_SOPC__S_CMP_LE_I32 - - // --- description from .arch file --- - // SCC = (S0.i <= S1.i). - void - Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_EQ_U32 class methods --- - - Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_eq_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_EQ_U32 - - Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32() - { - } // ~Inst_SOPC__S_CMP_EQ_U32 - - // --- description from .arch file --- - // SCC = (S0.u == S1.u). - void - Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() == src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LG_U32 class methods --- - - Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lg_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LG_U32 - - Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32() - { - } // ~Inst_SOPC__S_CMP_LG_U32 - - // --- description from .arch file --- - // SCC = (S0.u != S1.u). 
- void - Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() != src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GT_U32 class methods --- - - Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_gt_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GT_U32 - - Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32() - { - } // ~Inst_SOPC__S_CMP_GT_U32 - - // --- description from .arch file --- - // SCC = (S0.u > S1.u). - void - Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() > src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_GE_U32 class methods --- - - Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_ge_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_GE_U32 - - Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32() - { - } // ~Inst_SOPC__S_CMP_GE_U32 - - // --- description from .arch file --- - // SCC = (S0.u >= S1.u). - void - Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() >= src1.rawData()) ? 
1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LT_U32 class methods --- - - Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_lt_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LT_U32 - - Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32() - { - } // ~Inst_SOPC__S_CMP_LT_U32 - - // --- description from .arch file --- - // SCC = (S0.u < S1.u). - void - Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() < src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_CMP_LE_U32 class methods --- - - Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_cmp_le_u32") - { - setFlag(ALU); - } // Inst_SOPC__S_CMP_LE_U32 - - Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32() - { - } // ~Inst_SOPC__S_CMP_LE_U32 - - // --- description from .arch file --- - // SCC = (S0.u <= S1.u). - void - Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); - ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); - ScalarOperandU32 scc(gpuDynInst, REG_SCC); - - src0.read(); - src1.read(); - - scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; - - scc.write(); - } // execute - // --- Inst_SOPC__S_BITCMP0_B32 class methods --- - - Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt) - : Inst_SOPC(iFmt, "s_bitcmp0_b32") - { - setFlag(ALU); - } // Inst_SOPC__S_BITCMP0_B32 - - Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32() - { - } // ~Inst_SOPC__S_BITCMP0_B32 - - // --- description from .arch file --- - // SCC = (S0.u[S1.u[4:0]] == 0). 
    void
    Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // SCC <- 1 iff the bit of src0 selected by src1[4:0] is clear.
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPC__S_BITCMP1_B32 class methods ---

    Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_bitcmp1_b32")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_BITCMP1_B32

    Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32()
    {
    } // ~Inst_SOPC__S_BITCMP1_B32

    // --- description from .arch file ---
    // SCC = (S0.u[S1.u[4:0]] == 1).
    void
    Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // SCC <- 1 iff the bit of src0 selected by src1[4:0] is set.
        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPC__S_BITCMP0_B64 class methods ---

    Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_bitcmp0_b64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_BITCMP0_B64

    Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64()
    {
    } // ~Inst_SOPC__S_BITCMP0_B64

    // --- description from .arch file ---
    // SCC = (S0.u64[S1.u[5:0]] == 0).
    void
    Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // 64-bit variant: the bit index comes from src1[5:0].
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPC__S_BITCMP1_B64 class methods ---

    Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_bitcmp1_b64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_BITCMP1_B64

    Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64()
    {
    } // ~Inst_SOPC__S_BITCMP1_B64

    // --- description from .arch file ---
    // SCC = (S0.u64[S1.u[5:0]] == 1).
    void
    Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // 64-bit variant: the bit index comes from src1[5:0].
        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPC__S_SETVSKIP class methods ---

    Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_setvskip")
    {
    } // Inst_SOPC__S_SETVSKIP

    Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP()
    {
    } // ~Inst_SOPC__S_SETVSKIP

    // --- description from .arch file ---
    // VSKIP = S0.u[S1.u[4:0]].
    // Enables and disables VSKIP mode.
    // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instuctions are
    // issued.
    // If any vector operations are outstanding, S_WAITCNT must be issued
    // before executing.
    // This instruction requires one waitstate after executing (e.g. S_NOP 0).
    // Example:
    // s_waitcnt 0
    // s_setvskip 1, 0 // Enable vskip mode.
    // s_nop 1
    void
    Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst)
    {
        // VSKIP mode is not modeled.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPC__S_SET_GPR_IDX_ON class methods ---

    Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_set_gpr_idx_on")
    {
    } // Inst_SOPC__S_SET_GPR_IDX_ON

    Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON()
    {
    } // ~Inst_SOPC__S_SET_GPR_IDX_ON

    // --- description from .arch file ---
    // MODE.gpr_idx_en = 1;
    // M0[7:0] = S0.u[7:0];
    // M0[15:12] = SIMM4 (direct contents of S1 field);
    // // Remaining bits of M0 are unmodified.
    // Enable GPR indexing mode. Vector operations after this will perform
    // relative GPR addressing based on the contents of M0. The structure
    // SQ_M0_GPR_IDX_WORD may be used to decode M0.
    // The raw contents of the S1 field are read and used to set the enable
    // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and
    // S1[3] = VDST_REL.
    void
    Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst)
    {
        // GPR-indexing mode is not modeled.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPC__S_CMP_EQ_U64 class methods ---

    Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_eq_u64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_EQ_U64

    Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64()
    {
    } // ~Inst_SOPC__S_CMP_EQ_U64

    // --- description from .arch file ---
    // SCC = (S0.i64 == S1.i64).
    void
    Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // NOTE(review): signed 64-bit operand classes are used for an
        // unsigned compare; for ==/!= the result is bit-identical, so
        // this is benign.
        ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = (src0.rawData() == src1.rawData()) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPC__S_CMP_LG_U64 class methods ---

    Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt)
        : Inst_SOPC(iFmt, "s_cmp_lg_u64")
    {
        setFlag(ALU);
    } // Inst_SOPC__S_CMP_LG_U64

    Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64()
    {
    } // ~Inst_SOPC__S_CMP_LG_U64

    // --- description from .arch file ---
    // SCC = (S0.i64 != S1.i64).
    void
    Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // NOTE(review): signed operand classes for an unsigned compare;
        // benign for !=.
        ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
        ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src0.read();
        src1.read();

        scc = (src0.rawData() != src1.rawData()) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPP__S_NOP class methods ---

    Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_nop")
    {
        setFlag(Nop);
    } // Inst_SOPP__S_NOP

    Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP()
    {
    } // ~Inst_SOPP__S_NOP

    // --- description from .arch file ---
    // Do nothing. Repeat NOP 1..8 times based on SIMM16[2:0] -- 0 = 1 time,
    // 7 = 8 times.
    // This instruction may be used to introduce wait states to resolve
    // hazards; see the shader programming guide for details. Compare with
    // S_SLEEP.
    void
    Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
        // Intentionally empty: timing-model wait states are not simulated,
        // so the repeat count in SIMM16[2:0] is ignored.
    } // execute
    // --- Inst_SOPP__S_ENDPGM class methods ---

    Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_endpgm")
    {
        setFlag(EndOfKernel);
    } // Inst_SOPP__S_ENDPGM

    Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM()
    {
    } // ~Inst_SOPP__S_ENDPGM

    // --- description from .arch file ---
    // End of program; terminate wavefront.
    // The hardware implicitly executes S_WAITCNT 0 before executing this
    // --- instruction.
    // See S_ENDPGM_SAVED for the context-switch version of this instruction.
    void
    Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
    {
        // Terminates this wavefront: flush fetch state, release barrier
        // membership, free registers, and -- if this is the last WF of the
        // last WG in the kernel and end-of-kernel release is enabled --
        // inject a global memory sync before retiring the workgroup.
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();

        // delete extra instructions fetched for completed work-items
        wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
            wf->instructionBuffer.end());

        if (wf->pendingFetch) {
            wf->dropFetch = true;
        }

        wf->computeUnit->fetchStage.fetchUnit(wf->simdId)
            .flushBuf(wf->wfSlotId);
        wf->setStatus(Wavefront::S_STOPPED);

        int refCount = wf->computeUnit->getLds()
            .decreaseRefCounter(wf->dispatchId, wf->wgId);

        /**
         * The parent WF of this instruction is exiting, therefore
         * it should not participate in this barrier any longer. This
         * prevents possible deadlock issues if WFs exit early.
         */
        int bar_id = WFBarrier::InvalidID;
        if (wf->hasBarrier()) {
            assert(wf->getStatus() != Wavefront::S_BARRIER);
            bar_id = wf->barrierId();
            assert(bar_id != WFBarrier::InvalidID);
            wf->releaseBarrier();
            cu->decMaxBarrierCnt(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
                    "program and decrementing max barrier count for "
                    "barrier Id%d. New max count: %d.\n", cu->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
                    cu->maxBarrierCnt(bar_id));
        }

        DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
                wf->computeUnit->cu_id, wf->wgId, refCount);

        wf->computeUnit->registerManager->freeRegisters(wf);
        wf->computeUnit->stats.completedWfs++;
        wf->computeUnit->activeWaves--;

        panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
            "than zero\n", wf->computeUnit->cu_id);

        DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
            wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId);

        // Record RAW-distance statistics accumulated during execution.
        for (int i = 0; i < wf->vecReads.size(); i++) {
            if (wf->rawDist.find(i) != wf->rawDist.end()) {
                wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
            }
        }
        wf->vecReads.clear();
        wf->rawDist.clear();
        wf->lastInstExec = 0;

        // refCount == 0 means this was the last WF of its workgroup.
        if (!refCount) {
            /**
             * If all WFs have finished, and hence the WG has finished,
             * then we can free up the barrier belonging to the parent
             * WG, but only if we actually used a barrier (i.e., more
             * than one WF in the WG).
             */
            if (bar_id != WFBarrier::InvalidID) {
                DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are "
                        "now complete. Releasing barrier Id%d.\n", cu->cu_id,
                        wf->simdId, wf->wfSlotId, wf->wfDynId,
                        wf->barrierId());
                cu->releaseBarrier(bar_id);
            }

            /**
             * Last wavefront of the workgroup has executed return. If the
             * workgroup is not the final one in the kernel, then simply
             * retire it; however, if it is the final one, i.e., indicating
             * the kernel end, then release operation (i.e., GL2 WB) is
             * needed
             */

            //check whether the workgroup is indicating the kernel end, i.e.,
            //the last workgroup in the kernel
            bool kernelEnd =
                wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf);

            bool relNeeded =
                wf->computeUnit->shader->impl_kern_end_rel;

            //if it is not a kernel end, then retire the workgroup directly
            if (!kernelEnd || !relNeeded) {
                wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
                wf->setStatus(Wavefront::S_STOPPED);
                wf->computeUnit->stats.completedWGs++;

                return;
            }

            /**
             * if it is a kernel end, inject a memory sync, i.e., GL2 WB, and
             * retire the workgroup after receving response.
             * note that GL0V and GL1 are read only, and they just forward GL2
             * WB request. When forwarding, GL1 send the request to all GL2 in
             * the complex
             */
            setFlag(MemSync);
            setFlag(GlobalSegment);
            // Notify Memory System of Kernel Completion
            // Kernel End = isKernel + isMemSync
            wf->setStatus(Wavefront::S_RETURNING);
            gpuDynInst->simdId = wf->simdId;
            gpuDynInst->wfSlotId = wf->wfSlotId;
            gpuDynInst->wfDynId = wf->wfDynId;

            DPRINTF(GPUExec, "inject global memory fence for CU%d: "
                    "WF[%d][%d][%d]\n", wf->computeUnit->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId);

            // call shader to prepare the flush operations
            wf->computeUnit->shader->prepareFlush(gpuDynInst);

            wf->computeUnit->stats.completedWGs++;
        } else {
            wf->computeUnit->shader->dispatcher().scheduleDispatch();
        }
    } // execute

    // --- Inst_SOPP__S_BRANCH class methods ---

    Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_branch")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_BRANCH

    Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH()
    {
    } // ~Inst_SOPP__S_BRANCH

    // --- description from .arch file ---
    // PC = PC + signext(SIMM16 * 4) + 4 (short jump).
    // For a long jump, use S_SETPC.
    void
    Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unconditional short branch: SIMM16 is a signed word offset,
        // scaled by 4 bytes, relative to the next instruction (+4).
        Wavefront *wf = gpuDynInst->wavefront();
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;

        pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;

        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_WAKEUP class methods ---

    Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_wakeup")
    {
    } // Inst_SOPP__S_WAKEUP

    Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP()
    {
    } // ~Inst_SOPP__S_WAKEUP

    // --- description from .arch file ---
    // Allow a wave to 'ping' all the other waves in its threadgroup to force
    // them to wake up immediately from an S_SLEEP instruction. The ping is
    // ignored if the waves are not sleeping.
    // This allows for more efficient polling on a memory location. The waves
    // which are polling can sit in a long S_SLEEP between memory reads, but
    // the wave which writes the value can tell them all to wake up early now
    // that the data is available. This is useful for fBarrier implementations
    // (speedup).
    // This method is also safe from races because if any wave misses the ping,
    // everything still works fine (whoever missed it just completes their
    // normal S_SLEEP).
    void
    Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst)
    {
        // Wave wakeup pings are not modeled.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_CBRANCH_SCC0 class methods ---

    Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_scc0")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_SCC0

    Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0()
    {
    } // ~Inst_SOPP__S_CBRANCH_SCC0

    // --- description from .arch file ---
    // if(SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
- void - Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (!scc.rawData()) { - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - } - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_CBRANCH_SCC1 class methods --- - - Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_scc1") - { - setFlag(Branch); - } // Inst_SOPP__S_CBRANCH_SCC1 - - Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1() - { - } // ~Inst_SOPP__S_CBRANCH_SCC1 - - // --- description from .arch file --- - // if(SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); - - scc.read(); - - if (scc.rawData()) { - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - } - - wf->pc(pc); - } // execute - // --- Inst_SOPP__S_CBRANCH_VCCZ class methods --- - - Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_vccz") - { - setFlag(Branch); - setFlag(ReadsVCC); - } // Inst_SOPP__S_CBRANCH_VCCZ - - Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ() - { - } // ~Inst_SOPP__S_CBRANCH_VCCZ - - // --- description from .arch file --- - // if(VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. 
    void
    Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst)
    {
        // Branch when the full 64-bit VCC is zero; VCC is read as one
        // 64-bit operand anchored at REG_VCC_LO.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;

        vcc.read();

        if (!vcc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        // NOTE(review): unlike the VCCNZ variant below, the PC is written
        // back even on the not-taken path; the two styles appear to be
        // behaviorally equivalent here -- confirm against Wavefront::pc().
        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_VCCNZ class methods ---

    Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_vccnz")
    {
        setFlag(Branch);
        setFlag(ReadsVCC);
    } // Inst_SOPP__S_CBRANCH_VCCNZ

    Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_VCCNZ

    // --- description from .arch file ---
    // if(VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst)
    {
        // Branch when any bit of the 64-bit VCC is set; the PC is only
        // updated on the taken path.
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        vcc.read();

        if (vcc.rawData()) {
            Addr pc = gpuDynInst->pc();
            ScalarRegI16 simm16 = instData.SIMM16;
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
            wf->pc(pc);
        }
    } // execute
    // --- Inst_SOPP__S_CBRANCH_EXECZ class methods ---

    Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_execz")
    {
        setFlag(Branch);
        setFlag(ReadsEXEC);
    } // Inst_SOPP__S_CBRANCH_EXECZ

    Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_EXECZ

    // --- description from .arch file ---
    // if(EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
- void - Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (wf->execMask().none()) { - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - wf->pc(pc); - } - } // execute - // --- Inst_SOPP__S_CBRANCH_EXECNZ class methods --- - - Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_cbranch_execnz") - { - setFlag(Branch); - setFlag(ReadsEXEC); - } // Inst_SOPP__S_CBRANCH_EXECNZ - - Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ() - { - } // ~Inst_SOPP__S_CBRANCH_EXECNZ - - // --- description from .arch file --- - // if(EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4; - // else NOP. - void - Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (wf->execMask().any()) { - Addr pc = gpuDynInst->pc(); - ScalarRegI16 simm16 = instData.SIMM16; - pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL; - wf->pc(pc); - } - } // execute - // --- Inst_SOPP__S_BARRIER class methods --- - - Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt) - : Inst_SOPP(iFmt, "s_barrier") - { - setFlag(MemBarrier); - } // Inst_SOPP__S_BARRIER - - Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER() - { - } // ~Inst_SOPP__S_BARRIER - - // --- description from .arch file --- - // Synchronize waves within a threadgroup. - // If not all waves of the threadgroup have been created yet, waits for - // entire group before proceeding. - // If some waves in the threadgroup have already terminated, this waits on - // only the surviving waves. - // Barriers are legal inside trap handlers. 
    void
    Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst)
    {
        // Registers this wavefront's arrival at its workgroup barrier.
        // The scheduler-side release of stalled waves happens elsewhere;
        // this only increments the arrival count and logs progress.
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();

        if (wf->hasBarrier()) {
            int bar_id = wf->barrierId();
            assert(wf->getStatus() == Wavefront::S_BARRIER);
            cu->incNumAtBarrier(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at "
                    "barrier Id%d. %d waves now at barrier, %d waves "
                    "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId,
                    wf->wfDynId, bar_id, cu->numAtBarrier(bar_id),
                    cu->numYetToReachBarrier(bar_id));
        }
    } // execute
    // --- Inst_SOPP__S_SETKILL class methods ---

    Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_setkill")
    {
    } // Inst_SOPP__S_SETKILL

    Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL()
    {
    } // ~Inst_SOPP__S_SETKILL

    // --- description from .arch file ---
    // set KILL bit to value of SIMM16[0].
    // Used primarily for debugging kill wave host command behavior.
    void
    Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst)
    {
        // The KILL bit is not modeled.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_WAITCNT class methods ---

    Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_waitcnt")
    {
        setFlag(ALU);
        setFlag(Waitcnt);
    } // Inst_SOPP__S_WAITCNT

    Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT()
    {
    } // ~Inst_SOPP__S_WAITCNT

    // --- description from .arch file ---
    // Wait for the counts of outstanding lds, vector-memory and
    // --- export/vmem-write-data to be at or below the specified levels.
    // SIMM16[3:0] = vmcount (vector memory operations),
    // SIMM16[6:4] = export/mem-write-data count,
    // SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count).
    void
    Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst)
    {
        // Decode the three wait thresholds from the 16-bit immediate:
        // [3:0] vector memory, [6:4] export/write-data, [12:8] LGKM.
        ScalarRegI32 vm_cnt = 0;
        ScalarRegI32 exp_cnt = 0;
        ScalarRegI32 lgkm_cnt = 0;
        vm_cnt = bits(instData.SIMM16, 3, 0);
        exp_cnt = bits(instData.SIMM16, 6, 4);
        lgkm_cnt = bits(instData.SIMM16, 12, 8);
        // Stall the wave until the outstanding-op counters drop to (or
        // below) the requested levels.
        gpuDynInst->wavefront()->setStatus(Wavefront::S_WAITCNT);
        gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt);
    } // execute
    // --- Inst_SOPP__S_SETHALT class methods ---

    Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sethalt")
    {
    } // Inst_SOPP__S_SETHALT

    Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT()
    {
    } // ~Inst_SOPP__S_SETHALT

    // --- description from .arch file ---
    // Set HALT bit to value of SIMM16[0]; 1 = halt, 0 = resume.
    // The halt flag is ignored while PRIV == 1 (inside trap handlers) but the
    // shader will halt immediately after the handler returns if HALT is still
    // set at that time.
    void
    Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_SLEEP class methods ---

    Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sleep")
    {
        setFlag(ALU);
        setFlag(Sleep);
    } // Inst_SOPP__S_SLEEP

    Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP()
    {
    } // ~Inst_SOPP__S_SLEEP

    // --- description from .arch file ---
    // Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks.
    // The exact amount of delay is approximate. Compare with S_NOP.
    void
    Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst)
    {
        // NOTE(review): the .arch description above uses only SIMM16[2:0],
        // but the full 16-bit immediate is used here -- confirm intended.
        ScalarRegI32 simm16 = (ScalarRegI32)instData.SIMM16;
        gpuDynInst->wavefront()->setStatus(Wavefront::S_STALLED_SLEEP);
        // sleep duration is specified in multiples of 64 cycles
        gpuDynInst->wavefront()->setSleepTime(64 * simm16);
    } // execute
    // --- Inst_SOPP__S_SETPRIO class methods ---

    Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_setprio")
    {
        setFlag(ALU);
    } // Inst_SOPP__S_SETPRIO

    Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO()
    {
    } // ~Inst_SOPP__S_SETPRIO

    // --- description from .arch file ---
    // User settable wave priority is set to SIMM16[1:0]. 0 = lowest,
    // 3 = highest.
    // The overall wave priority is {SPIPrio[1:0] + UserPrio[1:0],
    // WaveAge[3:0]}.
    void
    Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegU16 simm16 = instData.SIMM16;
        ScalarRegU32 userPrio = simm16 & 0x3;

        // Wave priority is not modeled; warn once and continue.
        warn_once("S_SETPRIO ignored -- Requested priority %d\n", userPrio);
    } // execute
    // --- Inst_SOPP__S_SENDMSG class methods ---

    Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sendmsg")
    {
    } // Inst_SOPP__S_SENDMSG

    Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG()
    {
    } // ~Inst_SOPP__S_SENDMSG

    // --- description from .arch file ---
    // Send a message upstream to VGT or the interrupt handler.
    // SIMM16[9:0] contains the message type and is documented in the shader
    // --- programming guide.
    void
    Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_SENDMSGHALT class methods ---

    Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sendmsghalt")
    {
    } // Inst_SOPP__S_SENDMSGHALT

    Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT()
    {
    } // ~Inst_SOPP__S_SENDMSGHALT

    // --- description from .arch file ---
    // Send a message and then HALT the wavefront; see S_SENDMSG for details.
    void
    Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_TRAP class methods ---

    Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_trap")
    {
    } // Inst_SOPP__S_TRAP

    Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP()
    {
    } // ~Inst_SOPP__S_TRAP

    // --- description from .arch file ---
    // TrapID = SIMM16[7:0];
    // Wait for all instructions to complete;
    // set {TTMP1, TTMP0} = {3'h0, PCRewind[3:0], HT[0], TrapID[7:0],
    // PC[47:0]};
    // PC = TBA (trap base address);
    // PRIV = 1.
    // Enter the trap handler. This instruction may be generated internally as
    // well in response to a host trap (HT = 1) or an exception.
    // TrapID 0 is reserved for hardware use and should not be used in a
    // shader-generated trap.
    void
    Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst)
    {
        // Trap handlers are not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_ICACHE_INV class methods ---

    Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_icache_inv")
    {
    } // Inst_SOPP__S_ICACHE_INV

    Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV()
    {
    } // ~Inst_SOPP__S_ICACHE_INV

    // --- description from .arch file ---
    // Invalidate entire L1 instruction cache.
    // You must have 12 separate S_NOP instructions or a jump/branch
    // instruction after this instruction
    // to ensure the SQ instruction buffer is purged.
    void
    Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_INCPERFLEVEL class methods ---

    Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_incperflevel")
    {
    } // Inst_SOPP__S_INCPERFLEVEL

    Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL()
    {
    } // ~Inst_SOPP__S_INCPERFLEVEL

    // --- description from .arch file ---
    // Increment performance counter specified in SIMM16[3:0] by 1.
    void
    Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
    {
        // Hardware performance counters are not modeled.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_DECPERFLEVEL class methods ---

    Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_decperflevel")
    {
    } // Inst_SOPP__S_DECPERFLEVEL

    Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL()
    {
    } // ~Inst_SOPP__S_DECPERFLEVEL

    // --- description from .arch file ---
    // Decrement performance counter specified in SIMM16[3:0] by 1.
    void
    Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
    {
        // Hardware performance counters are not modeled.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_TTRACEDATA class methods ---

    Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_ttracedata")
    {
    } // Inst_SOPP__S_TTRACEDATA

    Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA()
    {
    } // ~Inst_SOPP__S_TTRACEDATA

    // --- description from .arch file ---
    // Send M0 as user data to the thread trace stream.
    void
    Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst)
    {
        // Thread tracing is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_CBRANCH_CDBGSYS class methods ---

    Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_cdbgsys")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_CDBGSYS

    Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS()
    {
    } // ~Inst_SOPP__S_CBRANCH_CDBGSYS

    // --- description from .arch file ---
    // if(conditional_debug_system != 0) then PC = PC + signext(SIMM16 * 4)
    // + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst)
    {
        // Conditional-debug state is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_CBRANCH_CDBGUSER class methods ---

    Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_cdbguser")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_CDBGUSER

    Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER()
    {
    } // ~Inst_SOPP__S_CBRANCH_CDBGUSER

    // --- description from .arch file ---
    // if(conditional_debug_user != 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst)
    {
        // Conditional-debug state is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER class methods ---

    Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER

    Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::
    ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER()
    {
    } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER

    // --- description from .arch file ---
    // if(conditional_debug_system || conditional_debug_user) then PC = PC +
    // --- signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst)
    {
        // Conditional-debug state is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER class methods ---

    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER

    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
    ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER()
    {
    } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER

    // --- description from .arch file ---
    // if(conditional_debug_system && conditional_debug_user) then PC = PC +
    // --- signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst)
    {
        // Conditional-debug state is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_ENDPGM_SAVED class methods ---

    Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_endpgm_saved")
    {
    } // Inst_SOPP__S_ENDPGM_SAVED

    Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED()
    {
    } // ~Inst_SOPP__S_ENDPGM_SAVED

    // --- description from .arch file ---
    // End of program; signal that a wave has been saved by the context-switch
    // trap handler and terminate wavefront.
    // The hardware implicitly executes S_WAITCNT 0 before executing this
    // instruction.
    // Use S_ENDPGM in all cases unless you are executing the context-switch
    // save handler.
    void
    Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst)
    {
        // Context-switch save handling is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_SET_GPR_IDX_OFF class methods ---

    Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_set_gpr_idx_off")
    {
    } // Inst_SOPP__S_SET_GPR_IDX_OFF

    Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF()
    {
    } // ~Inst_SOPP__S_SET_GPR_IDX_OFF

    // --- description from .arch file ---
    // MODE.gpr_idx_en = 0.
    // Clear GPR indexing mode. Vector operations after this will not perform
    // --- relative GPR addressing regardless of the contents of M0. This
    // --- instruction does not modify M0.
    void
    Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst)
    {
        // GPR indexing mode is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_SET_GPR_IDX_MODE class methods ---

    Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE(
          InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_set_gpr_idx_mode")
    {
    } // Inst_SOPP__S_SET_GPR_IDX_MODE

    Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE()
    {
    } // ~Inst_SOPP__S_SET_GPR_IDX_MODE

    // --- description from .arch file ---
    // M0[15:12] = SIMM4.
    // Modify the mode used for vector GPR indexing.
    // The raw contents of the source field are read and used to set the enable
    // bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL
    // and SIMM4[3] = VDST_REL.
    void
    Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst)
    {
        // GPR indexing mode is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_LOAD_DWORD class methods ---

    Inst_SMEM__S_LOAD_DWORD::Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dword")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORD

    Inst_SMEM__S_LOAD_DWORD::~Inst_SMEM__S_LOAD_DWORD()
    {
    } // ~Inst_SMEM__S_LOAD_DWORD

    /**
     * Read 1 dword from scalar data cache. If the offset is specified as an
     * sgpr, the sgpr contains an unsigned byte offset (the 2 LSBs are
     * ignored). If the offset is specified as an immediate 20-bit constant,
     * the constant is an unsigned byte offset.
     */
    void
    Inst_SMEM__S_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // SBASE encodes an SGPR-pair index, hence the shift by 1.
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);

        addr.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<1>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Write the loaded dword back to the destination SGPR.
        ScalarOperandU32 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_LOAD_DWORDX2 class methods ---

    Inst_SMEM__S_LOAD_DWORDX2::Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX2

    Inst_SMEM__S_LOAD_DWORDX2::~Inst_SMEM__S_LOAD_DWORDX2()
    {
    } // ~Inst_SMEM__S_LOAD_DWORDX2

    /**
     * Read 2 dwords from scalar data cache. See s_load_dword for details on
     * the offset input.
     */
    void
    Inst_SMEM__S_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // SBASE encodes an SGPR-pair index, hence the shift by 1.
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);

        addr.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<2>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Write the two loaded dwords back to the destination SGPR pair.
        ScalarOperandU64 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_LOAD_DWORDX4 class methods ---

    Inst_SMEM__S_LOAD_DWORDX4::Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX4

    Inst_SMEM__S_LOAD_DWORDX4::~Inst_SMEM__S_LOAD_DWORDX4()
    {
    } // ~Inst_SMEM__S_LOAD_DWORDX4

    // --- description from .arch file ---
    // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // SBASE encodes an SGPR-pair index, hence the shift by 1.
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);

        addr.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<4>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Write the four loaded dwords back to the destination SGPR quad.
        ScalarOperandU128 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_LOAD_DWORDX8 class methods ---

    Inst_SMEM__S_LOAD_DWORDX8::Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx8")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX8

    Inst_SMEM__S_LOAD_DWORDX8::~Inst_SMEM__S_LOAD_DWORDX8()
    {
    } // ~Inst_SMEM__S_LOAD_DWORDX8

    // --- description from .arch file ---
    // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // SBASE encodes an SGPR-pair index, hence the shift by 1.
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);

        addr.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<8>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Write the eight loaded dwords back to the destination SGPRs.
        ScalarOperandU256 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_LOAD_DWORDX16 class methods ---

    Inst_SMEM__S_LOAD_DWORDX16::Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx16")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX16

    Inst_SMEM__S_LOAD_DWORDX16::~Inst_SMEM__S_LOAD_DWORDX16()
    {
    } // ~Inst_SMEM__S_LOAD_DWORDX16

    // --- description from .arch file ---
    // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // SBASE encodes an SGPR-pair index, hence the shift by 1.
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);

        addr.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<16>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Write the sixteen loaded dwords back to the destination SGPRs.
        ScalarOperandU512 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORD class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORD::Inst_SMEM__S_BUFFER_LOAD_DWORD(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dword")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORD

    Inst_SMEM__S_BUFFER_LOAD_DWORD::~Inst_SMEM__S_BUFFER_LOAD_DWORD()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORD

    // --- description from .arch file ---
    // Read 1 dword from scalar data cache. See S_LOAD_DWORD for details on the
    // --- offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // Buffer variants address through a 128-bit resource descriptor.
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<1>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 1 request, size 32
        ScalarOperandU32 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX2 class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX2

    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::~Inst_SMEM__S_BUFFER_LOAD_DWORDX2()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX2

    // --- description from .arch file ---
    // Read 2 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // Buffer variants address through a 128-bit resource descriptor.
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<2>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // use U64 because 2 requests, each size 32
        ScalarOperandU64 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX4 class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX4

    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::~Inst_SMEM__S_BUFFER_LOAD_DWORDX4()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX4

    // --- description from .arch file ---
    // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // Buffer variants address through a 128-bit resource descriptor.
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<4>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 4 requests, each size 32
        ScalarOperandU128 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX8 class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::Inst_SMEM__S_BUFFER_LOAD_DWORDX8(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx8")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX8

    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::~Inst_SMEM__S_BUFFER_LOAD_DWORDX8()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX8

    // --- description from .arch file ---
    // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // Buffer variants address through a 128-bit resource descriptor.
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<8>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 8 requests, each size 32
        ScalarOperandU256 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX16 class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::Inst_SMEM__S_BUFFER_LOAD_DWORDX16(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx16")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX16

    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::~Inst_SMEM__S_BUFFER_LOAD_DWORDX16()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX16

    // --- description from .arch file ---
    // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // Buffer variants address through a 128-bit resource descriptor.
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemRead<16>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 16 requests, each size 32
        ScalarOperandU512 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_STORE_DWORD class methods ---

    Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_store_dword")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_STORE_DWORD

    Inst_SMEM__S_STORE_DWORD::~Inst_SMEM__S_STORE_DWORD()
    {
    } // ~Inst_SMEM__S_STORE_DWORD

    // --- description from .arch file ---
    // Write 1 dword to scalar data cache.
    // If the offset is specified as an SGPR, the SGPR contains an unsigned
    // BYTE offset (the 2 LSBs are ignored).
    // If the offset is specified as an immediate 20-bit constant, the
    // constant is an unsigned BYTE offset.
    void
    Inst_SMEM__S_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // SBASE encodes an SGPR-pair index, hence the shift by 1.
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
        ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA);

        addr.read();
        sdata.read();

        // Stage the dword to be written into the instruction's data buffer.
        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
                    sizeof(ScalarRegU32));

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemWrite<1>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Stores write no registers back; nothing to do on completion.
    } // completeAcc
    // --- Inst_SMEM__S_STORE_DWORDX2 class methods ---

    Inst_SMEM__S_STORE_DWORDX2::Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_store_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_STORE_DWORDX2

    Inst_SMEM__S_STORE_DWORDX2::~Inst_SMEM__S_STORE_DWORDX2()
    {
    } // ~Inst_SMEM__S_STORE_DWORDX2

    // --- description from .arch file ---
    // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // SBASE encodes an SGPR-pair index, hence the shift by 1.
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
        ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA);

        addr.read();
        sdata.read();

        // Stage the two dwords to be written into the data buffer.
        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
                    sizeof(ScalarRegU64));

        // The byte offset is either a 20-bit immediate or read from an SGPR.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemWrite<2>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Stores write no registers back; nothing to do on completion.
    } // completeAcc
    // --- Inst_SMEM__S_STORE_DWORDX4 class methods ---

    Inst_SMEM__S_STORE_DWORDX4::Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_store_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_STORE_DWORDX4

    Inst_SMEM__S_STORE_DWORDX4::~Inst_SMEM__S_STORE_DWORDX4()
    {
    } // ~Inst_SMEM__S_STORE_DWORDX4

    // --- description from .arch file ---
    // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
- void - Inst_SMEM__S_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - ScalarRegU32 offset(0); - ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); - ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); - - addr.read(); - sdata.read(); - - std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), - sizeof(gpuDynInst->scalar_data)); - - if (instData.IMM) { - offset = extData.OFFSET; - } else { - ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); - off_sgpr.read(); - offset = off_sgpr.rawData(); - } - - calcAddr(gpuDynInst, addr, offset); - - gpuDynInst->computeUnit()->scalarMemoryPipe. - issueRequest(gpuDynInst); - } // execute - - void - Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_SMEM__S_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_SMEM__S_BUFFER_STORE_DWORD class methods --- - - Inst_SMEM__S_BUFFER_STORE_DWORD::Inst_SMEM__S_BUFFER_STORE_DWORD( - InFmt_SMEM *iFmt) - : Inst_SMEM(iFmt, "s_buffer_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_SMEM__S_BUFFER_STORE_DWORD - - Inst_SMEM__S_BUFFER_STORE_DWORD::~Inst_SMEM__S_BUFFER_STORE_DWORD() - { - } // ~Inst_SMEM__S_BUFFER_STORE_DWORD - - // --- description from .arch file --- - // Write 1 dword to scalar data cache. See S_STORE_DWORD for details on the - // --- offset input. 
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_SMEM__S_BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Unreachable: execute() panics before any access is initiated.
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Unreachable: execute() panics before any access is initiated.
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_STORE_DWORDX2 class methods ---

    Inst_SMEM__S_BUFFER_STORE_DWORDX2::Inst_SMEM__S_BUFFER_STORE_DWORDX2(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_store_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_BUFFER_STORE_DWORDX2

    Inst_SMEM__S_BUFFER_STORE_DWORDX2::~Inst_SMEM__S_BUFFER_STORE_DWORDX2()
    {
    } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX2

    // --- description from .arch file ---
    // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Unreachable: execute() panics before any access is initiated.
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Unreachable: execute() panics before any access is initiated.
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_STORE_DWORDX4 class methods ---

    Inst_SMEM__S_BUFFER_STORE_DWORDX4::Inst_SMEM__S_BUFFER_STORE_DWORDX4(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_store_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_BUFFER_STORE_DWORDX4

    Inst_SMEM__S_BUFFER_STORE_DWORDX4::~Inst_SMEM__S_BUFFER_STORE_DWORDX4()
    {
    } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX4

    // --- description from .arch file ---
    // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Unreachable: execute() panics before any access is initiated.
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Unreachable: execute() panics before any access is initiated.
    } // completeAcc
    // --- Inst_SMEM__S_DCACHE_INV class methods ---

    Inst_SMEM__S_DCACHE_INV::Inst_SMEM__S_DCACHE_INV(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_inv")
    {
    } // Inst_SMEM__S_DCACHE_INV

    Inst_SMEM__S_DCACHE_INV::~Inst_SMEM__S_DCACHE_INV()
    {
    } // ~Inst_SMEM__S_DCACHE_INV

    // --- description from .arch file ---
    // Invalidate the scalar data cache.
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_DCACHE_INV::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_DCACHE_WB class methods ---

    Inst_SMEM__S_DCACHE_WB::Inst_SMEM__S_DCACHE_WB(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_wb")
    {
    } // Inst_SMEM__S_DCACHE_WB

    Inst_SMEM__S_DCACHE_WB::~Inst_SMEM__S_DCACHE_WB()
    {
    } // ~Inst_SMEM__S_DCACHE_WB

    // --- description from .arch file ---
    // Write back dirty data in the scalar data cache.
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_DCACHE_WB::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_DCACHE_INV_VOL class methods ---

    Inst_SMEM__S_DCACHE_INV_VOL::Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_inv_vol")
    {
    } // Inst_SMEM__S_DCACHE_INV_VOL

    Inst_SMEM__S_DCACHE_INV_VOL::~Inst_SMEM__S_DCACHE_INV_VOL()
    {
    } // ~Inst_SMEM__S_DCACHE_INV_VOL

    // --- description from .arch file ---
    // Invalidate the scalar data cache volatile lines.
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_DCACHE_INV_VOL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_DCACHE_WB_VOL class methods ---

    Inst_SMEM__S_DCACHE_WB_VOL::Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_wb_vol")
    {
    } // Inst_SMEM__S_DCACHE_WB_VOL

    Inst_SMEM__S_DCACHE_WB_VOL::~Inst_SMEM__S_DCACHE_WB_VOL()
    {
    } // ~Inst_SMEM__S_DCACHE_WB_VOL

    // --- description from .arch file ---
    // Write back dirty data in the scalar data cache volatile lines.
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_DCACHE_WB_VOL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_MEMTIME class methods ---

    Inst_SMEM__S_MEMTIME::Inst_SMEM__S_MEMTIME(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_memtime")
    {
        // s_memtime does not issue a memory request
        setFlag(ALU);
    } // Inst_SMEM__S_MEMTIME

    Inst_SMEM__S_MEMTIME::~Inst_SMEM__S_MEMTIME()
    {
    } // ~Inst_SMEM__S_MEMTIME

    // --- description from .arch file ---
    // Return current 64-bit timestamp.
    void
    Inst_SMEM__S_MEMTIME::execute(GPUDynInstPtr gpuDynInst)
    {
        // The compute unit's current cycle count serves as the timestamp.
        ScalarOperandU64 sdst(gpuDynInst, instData.SDATA);
        sdst = (ScalarRegU64)gpuDynInst->computeUnit()->curCycle();
        sdst.write();
    } // execute
    // --- Inst_SMEM__S_MEMREALTIME class methods ---

    Inst_SMEM__S_MEMREALTIME::Inst_SMEM__S_MEMREALTIME(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_memrealtime")
    {
    } // Inst_SMEM__S_MEMREALTIME

    Inst_SMEM__S_MEMREALTIME::~Inst_SMEM__S_MEMREALTIME()
    {
    } // ~Inst_SMEM__S_MEMREALTIME

    // --- description from .arch file ---
    // Return current 64-bit RTC.
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_MEMREALTIME::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_ATC_PROBE class methods ---

    Inst_SMEM__S_ATC_PROBE::Inst_SMEM__S_ATC_PROBE(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_atc_probe")
    {
    } // Inst_SMEM__S_ATC_PROBE

    Inst_SMEM__S_ATC_PROBE::~Inst_SMEM__S_ATC_PROBE()
    {
    } // ~Inst_SMEM__S_ATC_PROBE

    // --- description from .arch file ---
    // Probe or prefetch an address into the SQC data cache.
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_ATC_PROBE::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_ATC_PROBE_BUFFER class methods ---

    Inst_SMEM__S_ATC_PROBE_BUFFER::Inst_SMEM__S_ATC_PROBE_BUFFER(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_atc_probe_buffer")
    {
    } // Inst_SMEM__S_ATC_PROBE_BUFFER

    Inst_SMEM__S_ATC_PROBE_BUFFER::~Inst_SMEM__S_ATC_PROBE_BUFFER()
    {
    } // ~Inst_SMEM__S_ATC_PROBE_BUFFER

    // --- description from .arch file ---
    // Probe or prefetch an address into the SQC data cache.
    // Not implemented in the timing model; aborts simulation if decoded.
    void
    Inst_SMEM__S_ATC_PROBE_BUFFER::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_CNDMASK_B32 class methods ---

    Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_cndmask_b32")
    {
        setFlag(ALU);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_CNDMASK_B32

    Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32()
    {
    } // ~Inst_VOP2__V_CNDMASK_B32

    // --- description from .arch file ---
    // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
    // as a scalar GPR in S2.
    // Per-lane select between SRC0 and VSRC1 driven by the corresponding
    // VCC bit.
    void
    Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Lane i of the select condition is bit i of VCC.
                vdst[lane]
                    = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ADD_F32 class methods ---

    Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_ADD_F32

    Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32()
    {
    } // ~Inst_VOP2__V_ADD_F32

    // --- description from .arch file ---
    // D.f = S0.f + S1.f.
    void
    Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isDPPInst()) {
            // DPP encoding: SRC0 is re-read from the DPP extension word and
            // the lanes are permuted by processDPP() before the add.
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] + src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_F32 class methods ---

    Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_SUB_F32

    Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32()
    {
    } // ~Inst_VOP2__V_SUB_F32

    // --- description from .arch file ---
    // D.f = S0.f - S1.f.
    // SQ translates to V_ADD_F32.
- void - Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_F32 class methods --- - - Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_SUBREV_F32 - - Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32() - { - } // ~Inst_VOP2__V_SUBREV_F32 - - // --- description from .arch file --- - // D.f = S1.f - S0.f. - // SQ translates to V_ADD_F32. - void - Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_LEGACY_F32 class methods --- - - Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_legacy_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MUL_LEGACY_F32 - - Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32() - { - } // ~Inst_VOP2__V_MUL_LEGACY_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
    // Per-lane single-precision multiply. NOTE(review): despite the DX9
    // "0.0*x = 0.0" description above, this implementation performs a plain
    // IEEE multiply — the legacy special cases are handled in V_MUL_F32
    // below, not here; confirm against the hardware spec.
    void
    Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_F32 class methods ---

    Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MUL_F32

    Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32()
    {
    } // ~Inst_VOP2__V_MUL_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f.
    // Per-lane multiply with explicit handling of the zero/subnormal and
    // infinity operand combinations (0 * inf -> NaN, signed zero/infinity
    // results); only the fully-finite case falls through to the plain
    // IEEE multiply.
    void
    Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isnan(src0[lane]) ||
                    std::isnan(src1[lane])) {
                    // NaN propagates unconditionally.
                    vdst[lane] = NAN;
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           !std::signbit(src0[lane])) {
                    // src0 is +0 (subnormals are flushed to zero here).
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           std::signbit(src0[lane])) {
                    // src0 is -0: result zero takes the opposite of
                    // src1's sign.
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if (std::isinf(src0[lane]) &&
                           !std::signbit(src0[lane])) {
                    // src0 is +inf: inf * 0 -> NaN, else signed infinity.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else if (std::isinf(src0[lane]) &&
                           std::signbit(src0[lane])) {
                    // src0 is -inf: sign of the infinity result flips
                    // with src1's sign.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else {
                    // Both operands finite and nonzero: plain multiply.
                    vdst[lane] = src0[lane] * src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_I32_I24 class methods ---

    Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_I32_I24

    Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_I32_I24

    // --- description from .arch file ---
    // D.i = S0.i[23:0] * S1.i[23:0].
    // 24-bit signed multiply: each operand's low 24 bits are sign-extended
    // before the product is formed.
    void
    Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
                    * sext<24>(bits(src1[lane], 23, 0));
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_HI_I32_I24 class methods ---

    Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_I32_I24

    Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_HI_I32_I24

    // --- description from .arch file ---
    // D.i = (S0.i[23:0] * S1.i[23:0])>>32.
    // High half of the 24-bit signed multiply: widen both sign-extended
    // 24-bit operands to 64 bits so the full product exists, then keep
    // bits [63:32].
    void
    Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemI64 tmp_src0
                    = (VecElemI64)sext<24>(bits(src0[lane], 23, 0));
                VecElemI64 tmp_src1
                    = (VecElemI64)sext<24>(bits(src1[lane], 23, 0));

                vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_U32_U24 class methods ---

    Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_u32_u24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_U32_U24

    Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24()
    {
    } // ~Inst_VOP2__V_MUL_U32_U24

    // --- description from .arch file ---
    // D.u = S0.u[23:0] * S1.u[23:0].
    // 24-bit unsigned multiply, expressed as a lambda so vop2Helper can
    // handle the operand read/write and encoding (e.g. SDWA/DPP) plumbing.
    void
    Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
                         VecOperandU32& vdst, Wavefront* wf) {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = bits(src0[lane], 23, 0) *
                                 bits(src1[lane], 23, 0);
                }
            }
        };

        vop2Helper(gpuDynInst, opImpl);
    } // execute
    // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods ---

    Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_u32_u24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_U32_U24

    Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24()
    {
    } // ~Inst_VOP2__V_MUL_HI_U32_U24

    // --- description from .arch file ---
    // D.i = (S0.u[23:0] * S1.u[23:0])>>32.
- void - Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); - VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); - vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_F32 class methods --- - - Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MIN_F32 - - Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32() - { - } // ~Inst_VOP2__V_MIN_F32 - - // --- description from .arch file --- - // D.f = (S0.f < S1.f ? S0.f : S1.f). - void - Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_F32 class methods --- - - Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP2__V_MAX_F32 - - Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32() - { - } // ~Inst_VOP2__V_MAX_F32 - - // --- description from .arch file --- - // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
- void - Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_I32 class methods --- - - Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_I32 - - Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32() - { - } // ~Inst_VOP2__V_MIN_I32 - - // --- description from .arch file --- - // D.i = min(S0.i, S1.i). - void - Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_I32 class methods --- - - Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_I32 - - Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32() - { - } // ~Inst_VOP2__V_MAX_I32 - - // --- description from .arch file --- - // D.i = max(S0.i, S1.i). 
- void - Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_U32 class methods --- - - Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_U32 - - Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32() - { - } // ~Inst_VOP2__V_MIN_U32 - - // --- description from .arch file --- - // D.u = min(S0.u, S1.u). - void - Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_U32 class methods --- - - Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_U32 - - Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32() - { - } // ~Inst_VOP2__V_MAX_U32 - - // --- description from .arch file --- - // D.u = max(S0.u, S1.u). 
- void - Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHRREV_B32 class methods --- - - Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshrrev_b32") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHRREV_B32 - - Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32() - { - } // ~Inst_VOP2__V_LSHRREV_B32 - - // --- description from .arch file --- - // D.u = S1.u >> S0.u[4:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ASHRREV_I32 class methods --- - - Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ashrrev_i32") - { - setFlag(ALU); - } // Inst_VOP2__V_ASHRREV_I32 - - Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32() - { - } // ~Inst_VOP2__V_ASHRREV_I32 - - // --- description from .arch file --- - // D.i = signext(S1.i) >> S0.i[4:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
    // Reversed-operand arithmetic shift right; S1 is signed so the shift
    // extends the sign bit, and only the low 5 bits of S0 form the amount.
    void
    Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHLREV_B32 class methods ---

    Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshlrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHLREV_B32

    Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32()
    {
    } // ~Inst_VOP2__V_LSHLREV_B32

    // --- description from .arch file ---
    // D.u = S1.u << S0.u[4:0].
    // SQ translates this to an internal SP opcode.
    // Reversed-operand logical shift left. The SDWA path keeps pristine
    // copies of the operands so processSDWA_src/_dst can apply the
    // sub-dword selects against unmodified data.
    void
    Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and vdst during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: "
                    "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: "
                    "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_AND_B32 class methods ---

    Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_and_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_AND_B32

    Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32()
    {
    } // ~Inst_VOP2__V_AND_B32

    // --- description from .arch file ---
    // D.u = S0.u & S1.u.
    // Input and output modifiers not supported.
    // Per-lane bitwise AND. The DPP path re-reads SRC0 from the DPP
    // extension word and permutes lanes via processDPP() before the op.
    void
    Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isDPPInst()) {
            VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_AND_B32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] & src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] & src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_OR_B32 class methods ---

    Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_or_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_OR_B32

    Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32()
    {
    } // ~Inst_VOP2__V_OR_B32

    // --- description from .arch file ---
    // D.u = S0.u | S1.u.
    // Input and output modifiers not supported.
    // Per-lane bitwise OR with the same SDWA bookkeeping as V_LSHLREV_B32.
    void
    Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
                    "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] | src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] | src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_XOR_B32 class methods ---

    Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_xor_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_XOR_B32

    Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32()
    {
    } // ~Inst_VOP2__V_XOR_B32

    // --- description from .arch file ---
    // D.u = S0.u ^ S1.u.
    // Input and output modifiers not supported.
    // Per-lane bitwise XOR.
    void
    Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] ^ src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAC_F32 class methods ---

    Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mac_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAC);
    } // Inst_VOP2__V_MAC_F32

    Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32()
    {
    } // ~Inst_VOP2__V_MAC_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f + D.f.
    // SQ translates to V_MAD_F32.
    // Multiply-accumulate: the destination is read first because it is
    // also the addend. Uses fused multiply-add per lane.
    void
    Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
        // vdst doubles as the accumulator input, so pre-read it.
        vdst.read();

        if (isDPPInst()) {
            // DPP encoding: SRC0 is re-read from the DPP extension word
            // and permuted by processDPP() before the fma.
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_MAC_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = std::fma(src0_dpp[lane], src1[lane],
                                          vdst[lane]);
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MADMK_F32 class methods ---

    Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madmk_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP2__V_MADMK_F32

    Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32()
    {
    } // ~Inst_VOP2__V_MADMK_F32

    // --- description from .arch file ---
    // D.f = S0.f * K + S1.f; K is a 32-bit inline constant.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // --- modifiers.
    // SQ translates to V_MAD_F32.
- void - Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - VecElemF32 k = extData.imm_f32; - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], k, src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MADAK_F32 class methods --- - - Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madak_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP2__V_MADAK_F32 - - Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32() - { - } // ~Inst_VOP2__V_MADAK_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + K; K is a 32-bit inline constant. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // --- modifiers. - // SQ translates to V_MAD_F32. 
- void - Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - VecElemF32 k = extData.imm_f32; - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], k); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ADD_CO_U32 class methods --- - - Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_ADD_CO_U32 - - Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32() - { - } // ~Inst_VOP2__V_ADD_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED - // --- overflow or carry-out for V_ADDC_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. 
SRC0: register " - "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] + src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane] - + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); - } - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_CO_U32 class methods --- - - Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_SUB_CO_U32 - - Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32() - { - } // ~Inst_VOP2__V_SUB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. 
- void - Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_CO_U32 class methods --- - - Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP2__V_SUBREV_CO_U32 - - Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32() - { - } // ~Inst_VOP2__V_SUBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - void - Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - vcc.setBit(lane, src0[lane] > src1[lane] ? 
1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_ADDC_CO_U32 class methods --- - - Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_addc_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_ADDC_CO_U32 - - Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32() - { - } // ~Inst_VOP2__V_ADDC_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + VCC[threadId]; - // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0) - // is an UNSIGNED overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. - void - Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] - + bits(vcc.rawData(), lane); - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] - + (VecElemU64)bits(vcc.rawData(), lane, lane)) - >= 0x100000000 ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_SUBB_CO_U32 class methods --- - - Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subb_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_SUBB_CO_U32 - - Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32() - { - } // ~Inst_VOP2__V_SUBB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // --- overflow. 
- // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // --- source comes from the SGPR-pair at S2.u. - void - Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src0[lane] - src1[lane] - bits(vcc.rawData(), lane); - vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods --- - - Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subbrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP2__V_SUBBREV_CO_U32 - - Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32() - { - } // ~Inst_VOP2__V_SUBBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. - // SQ translates this to V_SUBREV_U32 with reversed operands. 
- void - Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - vcc.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] - = src1[lane] - src0[lane] - bits(vcc.rawData(), lane); - vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane)) - > src1[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP2__V_ADD_F16 class methods --- - - Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_ADD_F16 - - Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16() - { - } // ~Inst_VOP2__V_ADD_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_SUB_F16 class methods --- - - Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_SUB_F16 - - Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16() - { - } // ~Inst_VOP2__V_SUB_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 - S1.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. 
- void - Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_SUBREV_F16 class methods --- - - Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_SUBREV_F16 - - Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16() - { - } // ~Inst_VOP2__V_SUBREV_F16 - - // --- description from .arch file --- - // D.f16 = S1.f16 - S0.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. - void - Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MUL_F16 class methods --- - - Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MUL_F16 - - Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16() - { - } // ~Inst_VOP2__V_MUL_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MAC_F16 class methods --- - - Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mac_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAC); - } // Inst_VOP2__V_MAC_F16 - - Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16() - { - } // ~Inst_VOP2__V_MAC_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + D.f16. - // Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. 
- void - Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MADMK_F16 class methods --- - - Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madmk_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP2__V_MADMK_F16 - - Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16() - { - } // ~Inst_VOP2__V_MADMK_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored - // in the following literal DWORD. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // modifiers. Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. - void - Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MADAK_F16 class methods --- - - Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_madak_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP2__V_MADAK_F16 - - Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16() - { - } // ~Inst_VOP2__V_MADAK_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored - // in the following literal DWORD. - // This opcode cannot use the VOP3 encoding and cannot use input/output - // modifiers. Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. - void - Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_ADD_U16 class methods --- - - Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_ADD_U16 - - Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16() - { - } // ~Inst_VOP2__V_ADD_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 + S1.u16. 
- // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_U16 class methods --- - - Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_SUB_U16 - - Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16() - { - } // ~Inst_VOP2__V_SUB_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 - S1.u16. - // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_U16 class methods --- - - Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_SUBREV_U16 - - Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16() - { - } // ~Inst_VOP2__V_SUBREV_U16 - - // --- description from .arch file --- - // D.u16 = S1.u16 - S0.u16. - // Supports saturation (unsigned 16-bit integer domain). - // SQ translates this to V_SUB_U16 with reversed operands. 
- void - Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MUL_LO_U16 class methods --- - - Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_mul_lo_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MUL_LO_U16 - - Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16() - { - } // ~Inst_VOP2__V_MUL_LO_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 * S1.u16. - // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHLREV_B16 class methods --- - - Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshlrev_b16") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHLREV_B16 - - Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16() - { - } // ~Inst_VOP2__V_LSHLREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LSHRREV_B16 class methods --- - - Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_lshrrev_b16") - { - setFlag(ALU); - } // Inst_VOP2__V_LSHRREV_B16 - - Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16() - { - } // ~Inst_VOP2__V_LSHRREV_B16 - - // --- description from .arch file --- - // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. - void - Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_ASHRREV_I16 class methods --- - - Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ashrrev_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_ASHRREV_I16 - - Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16() - { - } // ~Inst_VOP2__V_ASHRREV_I16 - - // --- description from .arch file --- - // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_F16 class methods --- - - Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MAX_F16 - - Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16() - { - } // ~Inst_VOP2__V_MAX_F16 - - // --- description from .arch file --- - // D.f16 = max(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. - void - Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MIN_F16 class methods --- - - Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_MIN_F16 - - Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16() - { - } // ~Inst_VOP2__V_MIN_F16 - - // --- description from .arch file --- - // D.f16 = min(S0.f16, S1.f16). - // IEEE compliant. Supports denormals, round mode, exception flags, - // saturation. - void - Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_MAX_U16 class methods --- - - Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_U16 - - Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16() - { - } // ~Inst_VOP2__V_MAX_U16 - - // --- description from .arch file --- - // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). 
- void - Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MAX_I16 class methods --- - - Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_max_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_MAX_I16 - - Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16() - { - } // ~Inst_VOP2__V_MAX_I16 - - // --- description from .arch file --- - // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). - void - Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_U16 class methods --- - - Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_u16") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_U16 - - Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16() - { - } // ~Inst_VOP2__V_MIN_U16 - - // --- description from .arch file --- - // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). 
- void - Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_MIN_I16 class methods --- - - Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_min_i16") - { - setFlag(ALU); - } // Inst_VOP2__V_MIN_I16 - - Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16() - { - } // ~Inst_VOP2__V_MIN_I16 - - // --- description from .arch file --- - // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). - void - Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_LDEXP_F16 class methods --- - - Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_ldexp_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP2__V_LDEXP_F16 - - Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16() - { - } // ~Inst_VOP2__V_LDEXP_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * (2 ** S1.i16). 
- void - Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP2__V_ADD_U32 class methods --- - - Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_add_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_ADD_U32 - - Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32() - { - } // ~Inst_VOP2__V_ADD_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - void - Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - VecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - if (isSDWAInst()) { - VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0, src1, and dest during selecting - VecOperandU32 origSrc0_sdwa(gpuDynInst, - extData.iFmt_VOP_SDWA.SRC0); - VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); - VecOperandU32 origVdst(gpuDynInst, instData.VDST); - - src0_sdwa.read(); - origSrc0_sdwa.read(); - origSrc1.read(); - - DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. 
SRC0: register v[%d], " - "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " - "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " - "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", - extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, - extData.iFmt_VOP_SDWA.DST_U, - extData.iFmt_VOP_SDWA.CLMP, - extData.iFmt_VOP_SDWA.SRC0_SEL, - extData.iFmt_VOP_SDWA.SRC0_SEXT, - extData.iFmt_VOP_SDWA.SRC0_NEG, - extData.iFmt_VOP_SDWA.SRC0_ABS, - extData.iFmt_VOP_SDWA.SRC1_SEL, - extData.iFmt_VOP_SDWA.SRC1_SEXT, - extData.iFmt_VOP_SDWA.SRC1_NEG, - extData.iFmt_VOP_SDWA.SRC1_ABS); - - processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, - src1, origSrc1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0_sdwa[lane] + src1[lane]; - origVdst[lane] = vdst[lane]; // keep copy consistent - } - } - - processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUB_U32 class methods --- - - Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_sub_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_SUB_U32 - - Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32() - { - } // ~Inst_VOP2__V_SUB_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - void - Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP2__V_SUBREV_U32 class methods --- - - 
Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt) - : Inst_VOP2(iFmt, "v_subrev_u32") - { - setFlag(ALU); - } // Inst_VOP2__V_SUBREV_U32 - - Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32() - { - } // ~Inst_VOP2__V_SUBREV_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - void - Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_NOP class methods --- - - Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_nop") - { - setFlag(Nop); - setFlag(ALU); - } // Inst_VOP1__V_NOP - - Inst_VOP1__V_NOP::~Inst_VOP1__V_NOP() - { - } // ~Inst_VOP1__V_NOP - - // --- description from .arch file --- - // Do nothing. - void - Inst_VOP1__V_NOP::execute(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_VOP1__V_MOV_B32 class methods --- - - Inst_VOP1__V_MOV_B32::Inst_VOP1__V_MOV_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_mov_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_MOV_B32 - - Inst_VOP1__V_MOV_B32::~Inst_VOP1__V_MOV_B32() - { - } // ~Inst_VOP1__V_MOV_B32 - - // --- description from .arch file --- - // D.u = S0.u. - // Input and output modifiers not supported; this is an untyped operation. - void - Inst_VOP1__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (isDPPInst()) { - VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); - src_dpp.read(); - - DPRINTF(VEGA, "Handling V_MOV_B32 SRC DPP. 
SRC0: register v[%d], " - "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " - "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, " - "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0, - extData.iFmt_VOP_DPP.DPP_CTRL, - extData.iFmt_VOP_DPP.SRC0_ABS, - extData.iFmt_VOP_DPP.SRC0_NEG, - extData.iFmt_VOP_DPP.SRC1_ABS, - extData.iFmt_VOP_DPP.SRC1_NEG, - extData.iFmt_VOP_DPP.BC, - extData.iFmt_VOP_DPP.BANK_MASK, - extData.iFmt_VOP_DPP.ROW_MASK); - - // NOTE: For VOP1, there is no SRC1, so make sure we're not trying - // to negate it or take the absolute value of it - assert(!extData.iFmt_VOP_DPP.SRC1_ABS); - assert(!extData.iFmt_VOP_DPP.SRC1_NEG); - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src_dpp[lane]; - } - } - } else { - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_READFIRSTLANE_B32 class methods --- - - Inst_VOP1__V_READFIRSTLANE_B32::Inst_VOP1__V_READFIRSTLANE_B32( - InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_readfirstlane_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_READFIRSTLANE_B32 - - Inst_VOP1__V_READFIRSTLANE_B32::~Inst_VOP1__V_READFIRSTLANE_B32() - { - } // ~Inst_VOP1__V_READFIRSTLANE_B32 - - // --- description from .arch file --- - // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data - // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec) - // (Lane# = 0 if exec is zero). Ignores exec mask for the access. SQ - // translates to V_READLANE_B32. - // Input and output modifiers not supported; this is an untyped operation. 
    void
    Inst_VOP1__V_READFIRSTLANE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ScalarRegI32 src_lane(0);
        ScalarRegU64 exec_mask = wf->execMask().to_ullong();
        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Pick the lowest-numbered active lane; defaults to lane 0 when
        // EXEC is all zeros (per the .arch description above).
        if (exec_mask) {
            src_lane = findLsbSet(exec_mask);
        }

        // Broadcast that lane's value into the scalar destination.
        sdst = src[src_lane];

        sdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_I32_F64 class methods ---

    Inst_VOP1__V_CVT_I32_F64::Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_i32_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_I32_F64

    Inst_VOP1__V_CVT_I32_F64::~Inst_VOP1__V_CVT_I32_F64()
    {
    } // ~Inst_VOP1__V_CVT_I32_F64

    // --- description from .arch file ---
    // D.i = (int)S0.d.
    // Out-of-range floating point values (including infinity) saturate. NaN is
    // --- converted to 0.
    void
    Inst_VOP1__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
                // Only the binary exponent is needed; the mantissa return
                // value of frexp() is intentionally discarded.
                std::frexp(src[lane],&exp);
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (std::isinf(src[lane]) || exp > 30) {
                    // Saturate by sign.
                    // NOTE(review): frexp() gives |x| in [2^(exp-1), 2^exp),
                    // so exp > 30 also saturates values in [2^30, 2^31) that
                    // fit in an int32 — confirm whether exp > 31 was meant.
                    if (std::signbit(src[lane])) {
                        vdst[lane] = INT_MIN;
                    } else {
                        vdst[lane] = INT_MAX;
                    }
                } else {
                    vdst[lane] = (VecElemI32)src[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_F64_I32 class methods ---

    Inst_VOP1__V_CVT_F64_I32::Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f64_i32")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_F64_I32

    Inst_VOP1__V_CVT_F64_I32::~Inst_VOP1__V_CVT_F64_I32()
    {
    } // ~Inst_VOP1__V_CVT_F64_I32

    // --- description from .arch file ---
    // D.d = (double)S0.i.
    void
    Inst_VOP1__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Exact conversion: every int32 is representable as an F64.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF64)src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_F32_I32 class methods ---

    Inst_VOP1__V_CVT_F32_I32::Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_i32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_I32

    Inst_VOP1__V_CVT_F32_I32::~Inst_VOP1__V_CVT_F32_I32()
    {
    } // ~Inst_VOP1__V_CVT_F32_I32

    // --- description from .arch file ---
    // D.f = (float)S0.i.
    void
    Inst_VOP1__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Per-lane int32 -> float conversion (rounds per the host FPU for
        // magnitudes above 2^24).
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_F32_U32 class methods ---

    Inst_VOP1__V_CVT_F32_U32::Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_u32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_U32

    Inst_VOP1__V_CVT_F32_U32::~Inst_VOP1__V_CVT_F32_U32()
    {
    } // ~Inst_VOP1__V_CVT_F32_U32

    // --- description from .arch file ---
    // D.f = (float)S0.u.
    void
    Inst_VOP1__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Per-lane uint32 -> float conversion.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_U32_F32 class methods ---

    Inst_VOP1__V_CVT_U32_F32::Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_u32_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_U32_F32

    Inst_VOP1__V_CVT_U32_F32::~Inst_VOP1__V_CVT_U32_F32()
    {
    } // ~Inst_VOP1__V_CVT_U32_F32

    // --- description from .arch file ---
    // D.u = (unsigned)S0.f.
    // Out-of-range floating point values (including infinity) saturate. NaN is
    // --- converted to 0.
    void
    Inst_VOP1__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
                // Only the binary exponent is needed from frexp().
                std::frexp(src[lane],&exp);
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (std::isinf(src[lane])) {
                    if (std::signbit(src[lane])) {
                        vdst[lane] = 0;
                    } else {
                        vdst[lane] = UINT_MAX;
                    }
                } else if (exp > 31) {
                    // NOTE(review): this clips |x| >= 2^31 even though
                    // [2^31, 2^32) fits a uint32, and it saturates large
                    // *negative* finite values to UINT_MAX rather than 0;
                    // also (VecElemU32)x is UB for negative x — confirm
                    // intended saturation behavior.
                    vdst[lane] = UINT_MAX;
                } else {
                    vdst[lane] = (VecElemU32)src[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_I32_F32 class methods ---

    Inst_VOP1__V_CVT_I32_F32::Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_i32_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_I32_F32

    Inst_VOP1__V_CVT_I32_F32::~Inst_VOP1__V_CVT_I32_F32()
    {
    } // ~Inst_VOP1__V_CVT_I32_F32

    // --- description from .arch file ---
    // D.i = (int)S0.f.
    // Out-of-range floating point values (including infinity) saturate. NaN is
    // --- converted to 0.
    void
    Inst_VOP1__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
                // Only the binary exponent is needed from frexp().
                std::frexp(src[lane],&exp);
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (std::isinf(src[lane]) || exp > 30) {
                    // Saturate by sign (same exponent threshold as the
                    // F64 variant of this conversion above).
                    if (std::signbit(src[lane])) {
                        vdst[lane] = INT_MIN;
                    } else {
                        vdst[lane] = INT_MAX;
                    }
                } else {
                    vdst[lane] = (VecElemI32)src[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_MOV_FED_B32 class methods ---

    Inst_VOP1__V_MOV_FED_B32::Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_mov_fed_b32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_MOV_FED_B32

    Inst_VOP1__V_MOV_FED_B32::~Inst_VOP1__V_MOV_FED_B32()
    {
    } // ~Inst_VOP1__V_MOV_FED_B32

    // --- description from .arch file ---
    // D.u = S0.u;
    // Introduce EDC double error upon write to dest vgpr without causing an
    // --- exception.
    // Input and output modifiers not supported; this is an untyped operation.
    void
    Inst_VOP1__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // EDC error injection is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_CVT_F16_F32 class methods ---

    Inst_VOP1__V_CVT_F16_F32::Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f16_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F16_F32

    Inst_VOP1__V_CVT_F16_F32::~Inst_VOP1__V_CVT_F16_F32()
    {
    } // ~Inst_VOP1__V_CVT_F16_F32

    // --- description from .arch file ---
    // D.f16 = flt32_to_flt16(S0.f).
    // Supports input modifiers and creates FP16 denormals when appropriate.
    void
    Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // FP16 conversions are not implemented.
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_CVT_F32_F16 class methods ---

    Inst_VOP1__V_CVT_F32_F16::Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_f16")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_F32_F16

    Inst_VOP1__V_CVT_F32_F16::~Inst_VOP1__V_CVT_F32_F16()
    {
    } // ~Inst_VOP1__V_CVT_F32_F16

    // --- description from .arch file ---
    // D.f = flt16_to_flt32(S0.f16).
    // FP16 denormal inputs are always accepted.
    void
    Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // FP16 conversions are not implemented.
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods ---

    Inst_VOP1__V_CVT_RPI_I32_F32::Inst_VOP1__V_CVT_RPI_I32_F32(
        InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_RPI_I32_F32

    Inst_VOP1__V_CVT_RPI_I32_F32::~Inst_VOP1__V_CVT_RPI_I32_F32()
    {
    } // ~Inst_VOP1__V_CVT_RPI_I32_F32

    // --- description from .arch file ---
    // D.i = (int)floor(S0.f + 0.5).
    void
    Inst_VOP1__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Round-to-plus-infinity conversion: floor(x + 0.5).
        // NOTE(review): no NaN/saturation handling here, unlike the other
        // float->int conversions above — confirm that is intended.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_FLR_I32_F32 class methods ---

    Inst_VOP1__V_CVT_FLR_I32_F32::Inst_VOP1__V_CVT_FLR_I32_F32(
        InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_FLR_I32_F32

    Inst_VOP1__V_CVT_FLR_I32_F32::~Inst_VOP1__V_CVT_FLR_I32_F32()
    {
    } // ~Inst_VOP1__V_CVT_FLR_I32_F32

    // --- description from .arch file ---
    // D.i = (int)floor(S0.f).
    void
    Inst_VOP1__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Round-to-minus-infinity conversion.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemI32)std::floor(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_OFF_F32_I4 class methods ---

    Inst_VOP1__V_CVT_OFF_F32_I4::Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_off_f32_i4")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_CVT_OFF_F32_I4

    Inst_VOP1__V_CVT_OFF_F32_I4::~Inst_VOP1__V_CVT_OFF_F32_I4()
    {
    } // ~Inst_VOP1__V_CVT_OFF_F32_I4

    // --- description from .arch file ---
    // 4-bit signed int to 32-bit float. Used for interpolation in shader.
    void
    Inst_VOP1__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst)
    {
        // Could not parse sq_uc.arch desc field
        panicUnimplemented();
    } // execute
    // --- Inst_VOP1__V_CVT_F32_F64 class methods ---

    Inst_VOP1__V_CVT_F32_F64::Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f32_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_F32_F64

    Inst_VOP1__V_CVT_F32_F64::~Inst_VOP1__V_CVT_F32_F64()
    {
    } // ~Inst_VOP1__V_CVT_F32_F64

    // --- description from .arch file ---
    // D.f = (float)S0.d.
- void - Inst_VOP1__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F64_F32 class methods --- - - Inst_VOP1__V_CVT_F64_F32::Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f64_f32") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_F64_F32 - - Inst_VOP1__V_CVT_F64_F32::~Inst_VOP1__V_CVT_F64_F32() - { - } // ~Inst_VOP1__V_CVT_F64_F32 - - // --- description from .arch file --- - // D.d = (double)S0.f. - void - Inst_VOP1__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE0 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE0::Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE0 - - Inst_VOP1__V_CVT_F32_UBYTE0::~Inst_VOP1__V_CVT_F32_UBYTE0() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE0 - - // --- description from .arch file --- - // D.f = (float)(S0.u[7:0]). 
- void - Inst_VOP1__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE1 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE1::Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE1 - - Inst_VOP1__V_CVT_F32_UBYTE1::~Inst_VOP1__V_CVT_F32_UBYTE1() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE1 - - // --- description from .arch file --- - // D.f = (float)(S0.u[15:8]). - void - Inst_VOP1__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE2 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE2::Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE2 - - Inst_VOP1__V_CVT_F32_UBYTE2::~Inst_VOP1__V_CVT_F32_UBYTE2() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE2 - - // --- description from .arch file --- - // D.f = (float)(S0.u[23:16]). 
- void - Inst_VOP1__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_F32_UBYTE3 class methods --- - - Inst_VOP1__V_CVT_F32_UBYTE3::Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CVT_F32_UBYTE3 - - Inst_VOP1__V_CVT_F32_UBYTE3::~Inst_VOP1__V_CVT_F32_UBYTE3() - { - } // ~Inst_VOP1__V_CVT_F32_UBYTE3 - - // --- description from .arch file --- - // D.f = (float)(S0.u[31:24]). - void - Inst_VOP1__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CVT_U32_F64 class methods --- - - Inst_VOP1__V_CVT_U32_F64::Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_cvt_u32_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_CVT_U32_F64 - - Inst_VOP1__V_CVT_U32_F64::~Inst_VOP1__V_CVT_U32_F64() - { - } // ~Inst_VOP1__V_CVT_U32_F64 - - // --- description from .arch file --- - // D.u = (unsigned)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
    void
    Inst_VOP1__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
                // Only the binary exponent is needed from frexp().
                std::frexp(src[lane],&exp);
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (std::isinf(src[lane])) {
                    if (std::signbit(src[lane])) {
                        vdst[lane] = 0;
                    } else {
                        vdst[lane] = UINT_MAX;
                    }
                } else if (exp > 31) {
                    // NOTE(review): same threshold as the F32 variant; this
                    // clips [2^31, 2^32) which fits a uint32, and large
                    // negative finite values land here too — confirm.
                    vdst[lane] = UINT_MAX;
                } else {
                    vdst[lane] = (VecElemU32)src[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CVT_F64_U32 class methods ---

    Inst_VOP1__V_CVT_F64_U32::Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_cvt_f64_u32")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CVT_F64_U32

    Inst_VOP1__V_CVT_F64_U32::~Inst_VOP1__V_CVT_F64_U32()
    {
    } // ~Inst_VOP1__V_CVT_F64_U32

    // --- description from .arch file ---
    // D.d = (double)S0.u.
    void
    Inst_VOP1__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Exact conversion: every uint32 is representable as an F64.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF64)src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_TRUNC_F64 class methods ---

    Inst_VOP1__V_TRUNC_F64::Inst_VOP1__V_TRUNC_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_trunc_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_TRUNC_F64

    Inst_VOP1__V_TRUNC_F64::~Inst_VOP1__V_TRUNC_F64()
    {
    } // ~Inst_VOP1__V_TRUNC_F64

    // --- description from .arch file ---
    // D.d = trunc(S0.d), return integer part of S0.d.
    void
    Inst_VOP1__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Round toward zero for every active lane.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::trunc(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_CEIL_F64 class methods ---

    Inst_VOP1__V_CEIL_F64::Inst_VOP1__V_CEIL_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_ceil_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_CEIL_F64

    Inst_VOP1__V_CEIL_F64::~Inst_VOP1__V_CEIL_F64()
    {
    } // ~Inst_VOP1__V_CEIL_F64

    // --- description from .arch file ---
    // D.d = trunc(S0.d);
    // if(S0.d > 0.0 && S0.d != D.d) then D.d += 1.0.
    void
    Inst_VOP1__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // std::ceil implements the trunc-then-bump recipe from the .arch
        // text directly.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::ceil(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_RNDNE_F64 class methods ---

    Inst_VOP1__V_RNDNE_F64::Inst_VOP1__V_RNDNE_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rndne_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_RNDNE_F64

    Inst_VOP1__V_RNDNE_F64::~Inst_VOP1__V_RNDNE_F64()
    {
    } // ~Inst_VOP1__V_RNDNE_F64

    // --- description from .arch file ---
    // D.d = round_nearest_even(S0.d).
    void
    Inst_VOP1__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // roundNearestEven() is the ISA helper for ties-to-even rounding.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = roundNearestEven(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_FLOOR_F64 class methods ---

    Inst_VOP1__V_FLOOR_F64::Inst_VOP1__V_FLOOR_F64(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_floor_f64")
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP1__V_FLOOR_F64

    Inst_VOP1__V_FLOOR_F64::~Inst_VOP1__V_FLOOR_F64()
    {
    } // ~Inst_VOP1__V_FLOOR_F64

    // --- description from .arch file ---
    // D.d = trunc(S0.d);
    // if(S0.d < 0.0 && S0.d != D.d) then D.d += -1.0.
    void
    Inst_VOP1__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // std::floor implements the trunc-then-bump recipe from the .arch
        // text directly.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::floor(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_FRACT_F32 class methods ---

    Inst_VOP1__V_FRACT_F32::Inst_VOP1__V_FRACT_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_fract_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_FRACT_F32

    Inst_VOP1__V_FRACT_F32::~Inst_VOP1__V_FRACT_F32()
    {
    } // ~Inst_VOP1__V_FRACT_F32

    // --- description from .arch file ---
    // D.f = S0.f - floor(S0.f).
- void - Inst_VOP1__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_TRUNC_F32 class methods --- - - Inst_VOP1__V_TRUNC_F32::Inst_VOP1__V_TRUNC_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_trunc_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_TRUNC_F32 - - Inst_VOP1__V_TRUNC_F32::~Inst_VOP1__V_TRUNC_F32() - { - } // ~Inst_VOP1__V_TRUNC_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f), return integer part of S0.f. - void - Inst_VOP1__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst (gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::trunc(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_CEIL_F32 class methods --- - - Inst_VOP1__V_CEIL_F32::Inst_VOP1__V_CEIL_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_ceil_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_CEIL_F32 - - Inst_VOP1__V_CEIL_F32::~Inst_VOP1__V_CEIL_F32() - { - } // ~Inst_VOP1__V_CEIL_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f); - // if(S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. 
    void
    Inst_VOP1__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // std::ceil implements the trunc-then-bump recipe from the .arch
        // text directly.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::ceil(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_RNDNE_F32 class methods ---

    Inst_VOP1__V_RNDNE_F32::Inst_VOP1__V_RNDNE_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rndne_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_RNDNE_F32

    Inst_VOP1__V_RNDNE_F32::~Inst_VOP1__V_RNDNE_F32()
    {
    } // ~Inst_VOP1__V_RNDNE_F32

    // --- description from .arch file ---
    // D.f = round_nearest_even(S0.f).
    void
    Inst_VOP1__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // roundNearestEven() is the ISA helper for ties-to-even rounding.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = roundNearestEven(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_FLOOR_F32 class methods ---

    Inst_VOP1__V_FLOOR_F32::Inst_VOP1__V_FLOOR_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_floor_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_FLOOR_F32

    Inst_VOP1__V_FLOOR_F32::~Inst_VOP1__V_FLOOR_F32()
    {
    } // ~Inst_VOP1__V_FLOOR_F32

    // --- description from .arch file ---
    // D.f = trunc(S0.f);
    // if(S0.f < 0.0 && S0.f != D.f) then D.f += -1.0.
    void
    Inst_VOP1__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // std::floor implements the trunc-then-bump recipe from the .arch
        // text directly.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::floor(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_EXP_F32 class methods ---

    Inst_VOP1__V_EXP_F32::Inst_VOP1__V_EXP_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_exp_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_EXP_F32

    Inst_VOP1__V_EXP_F32::~Inst_VOP1__V_EXP_F32()
    {
    } // ~Inst_VOP1__V_EXP_F32

    // --- description from .arch file ---
    // D.f = pow(2.0, S0.f).
    void
    Inst_VOP1__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Base-2 exponential; computed in double precision via std::pow
        // and narrowed on assignment.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::pow(2.0, src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_LOG_F32 class methods ---

    Inst_VOP1__V_LOG_F32::Inst_VOP1__V_LOG_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_log_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_LOG_F32

    Inst_VOP1__V_LOG_F32::~Inst_VOP1__V_LOG_F32()
    {
    } // ~Inst_VOP1__V_LOG_F32

    // --- description from .arch file ---
    // D.f = log2(S0.f). Base 2 logarithm.
    void
    Inst_VOP1__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Base-2 logarithm per active lane.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::log2(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_RCP_F32 class methods ---

    Inst_VOP1__V_RCP_F32::Inst_VOP1__V_RCP_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rcp_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_RCP_F32

    Inst_VOP1__V_RCP_F32::~Inst_VOP1__V_RCP_F32()
    {
    } // ~Inst_VOP1__V_RCP_F32

    // --- description from .arch file ---
    // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error.
    void
    Inst_VOP1__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // Plain IEEE division covers the special cases (1/0 -> inf,
        // 1/inf -> 0, NaN propagates).
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_RCP_IFLAG_F32 class methods ---

    Inst_VOP1__V_RCP_IFLAG_F32::Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_rcp_iflag_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_RCP_IFLAG_F32

    Inst_VOP1__V_RCP_IFLAG_F32::~Inst_VOP1__V_RCP_IFLAG_F32()
    {
    } // ~Inst_VOP1__V_RCP_IFLAG_F32

    // --- description from .arch file ---
    // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise
    // --- integer DIV_BY_ZERO exception but cannot raise floating-point
    // --- exceptions.
- void - Inst_VOP1__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RSQ_F32 class methods --- - - Inst_VOP1__V_RSQ_F32::Inst_VOP1__V_RSQ_F32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP1__V_RSQ_F32 - - Inst_VOP1__V_RSQ_F32::~Inst_VOP1__V_RSQ_F32() - { - } // ~Inst_VOP1__V_RSQ_F32 - - // --- description from .arch file --- - // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules. - void - Inst_VOP1__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = 1.0 / std::sqrt(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RCP_F64 class methods --- - - Inst_VOP1__V_RCP_F64::Inst_VOP1__V_RCP_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rcp_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RCP_F64 - - Inst_VOP1__V_RCP_F64::~Inst_VOP1__V_RCP_F64() - { - } // ~Inst_VOP1__V_RCP_F64 - - // --- description from .arch file --- - // D.d = 1.0 / S0.d. 
- void - Inst_VOP1__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, instData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::fpclassify(src[lane]) == FP_ZERO) { - vdst[lane] = +INFINITY; - } else if (std::isnan(src[lane])) { - vdst[lane] = NAN; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = 0.0; - } - } else { - vdst[lane] = 1.0 / src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_RSQ_F64 class methods --- - - Inst_VOP1__V_RSQ_F64::Inst_VOP1__V_RSQ_F64(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_rsq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP1__V_RSQ_F64 - - Inst_VOP1__V_RSQ_F64::~Inst_VOP1__V_RSQ_F64() - { - } // ~Inst_VOP1__V_RSQ_F64 - - // --- description from .arch file --- - // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32. 
// D.d = 1.0 / sqrt(S0.d), with explicit IEEE special-case handling:
// zero -> +inf, NaN -> NaN, +inf -> 0, any negative input -> NaN.
void
Inst_VOP1__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            if (std::fpclassify(src[lane]) == FP_ZERO) {
                vdst[lane] = +INFINITY;
            } else if (std::isnan(src[lane])) {
                vdst[lane] = NAN;
            } else if (std::isinf(src[lane])
                       && !std::signbit(src[lane])) {
                vdst[lane] = 0.0;
            } else if (std::signbit(src[lane])) {
                // sqrt of a negative value is undefined
                vdst[lane] = NAN;
            } else {
                vdst[lane] = 1.0 / std::sqrt(src[lane]);
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_SQRT_F32 class methods ---

Inst_VOP1__V_SQRT_F32::Inst_VOP1__V_SQRT_F32(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_sqrt_f32")
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP1__V_SQRT_F32

Inst_VOP1__V_SQRT_F32::~Inst_VOP1__V_SQRT_F32()
{
} // ~Inst_VOP1__V_SQRT_F32

// --- description from .arch file ---
// D.f = sqrt(S0.f).
void
Inst_VOP1__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::sqrt(src[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_SQRT_F64 class methods ---

Inst_VOP1__V_SQRT_F64::Inst_VOP1__V_SQRT_F64(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_sqrt_f64")
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP1__V_SQRT_F64

Inst_VOP1__V_SQRT_F64::~Inst_VOP1__V_SQRT_F64()
{
} // ~Inst_VOP1__V_SQRT_F64

// --- description from .arch file ---
// D.d = sqrt(S0.d).
// D.d = sqrt(S0.d), per active lane.
void
Inst_VOP1__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::sqrt(src[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_SIN_F32 class methods ---

Inst_VOP1__V_SIN_F32::Inst_VOP1__V_SIN_F32(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_sin_f32")
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP1__V_SIN_F32

Inst_VOP1__V_SIN_F32::~Inst_VOP1__V_SIN_F32()
{
} // ~Inst_VOP1__V_SIN_F32

// --- description from .arch file ---
// D.f = sin(S0.f * 2 * PI).
// Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
// float 0.0.
void
Inst_VOP1__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
    // PI is read from a dedicated constant register rather than hard-coded.
    ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();
    pi.read();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Inputs outside [-256, 256] yield 0.0 per the spec above.
            if (src[lane] < -256.0 || src[lane] > 256.0) {
                vdst[lane] = 0.0;
            } else {
                // Input is in revolutions, hence the 2*PI scaling.
                vdst[lane] = std::sin(src[lane] * 2.0 * pi.rawData());
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_COS_F32 class methods ---

Inst_VOP1__V_COS_F32::Inst_VOP1__V_COS_F32(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_cos_f32")
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP1__V_COS_F32

Inst_VOP1__V_COS_F32::~Inst_VOP1__V_COS_F32()
{
} // ~Inst_VOP1__V_COS_F32

// --- description from .arch file ---
// D.f = cos(S0.f * 2 * PI).
// Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
// float 1.0.
- void - Inst_VOP1__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, instData.SRC0); - ConstScalarOperandF32 pi(gpuDynInst, REG_PI); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - pi.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (src[lane] < -256.0 || src[lane] > 256.0) { - vdst[lane] = 0.0; - } else { - vdst[lane] = std::cos(src[lane] * 2.0 * pi.rawData()); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_NOT_B32 class methods --- - - Inst_VOP1__V_NOT_B32::Inst_VOP1__V_NOT_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_not_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_NOT_B32 - - Inst_VOP1__V_NOT_B32::~Inst_VOP1__V_NOT_B32() - { - } // ~Inst_VOP1__V_NOT_B32 - - // --- description from .arch file --- - // D.u = ~S0.u. - // Input and output modifiers not supported. - void - Inst_VOP1__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, instData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ~src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP1__V_BFREV_B32 class methods --- - - Inst_VOP1__V_BFREV_B32::Inst_VOP1__V_BFREV_B32(InFmt_VOP1 *iFmt) - : Inst_VOP1(iFmt, "v_bfrev_b32") - { - setFlag(ALU); - } // Inst_VOP1__V_BFREV_B32 - - Inst_VOP1__V_BFREV_B32::~Inst_VOP1__V_BFREV_B32() - { - } // ~Inst_VOP1__V_BFREV_B32 - - // --- description from .arch file --- - // D.u[31:0] = S0.u[0:31], bitfield reverse. - // Input and output modifiers not supported. 
// D.u[31:0] = S0.u[0:31] -- reverse the bit order of each active lane.
void
Inst_VOP1__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = reverseBits(src[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_FFBH_U32 class methods ---

Inst_VOP1__V_FFBH_U32::Inst_VOP1__V_FFBH_U32(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_ffbh_u32")
{
    setFlag(ALU);
} // Inst_VOP1__V_FFBH_U32

Inst_VOP1__V_FFBH_U32::~Inst_VOP1__V_FFBH_U32()
{
} // ~Inst_VOP1__V_FFBH_U32

// --- description from .arch file ---
// D.u = position of first 1 in S0.u from MSB;
// D.u = 0xffffffff if S0.u == 0.
void
Inst_VOP1__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    // findFirstOneMsb is expected to return 0xffffffff for zero input
    // per the description above.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = findFirstOneMsb(src[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_FFBL_B32 class methods ---

Inst_VOP1__V_FFBL_B32::Inst_VOP1__V_FFBL_B32(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_ffbl_b32")
{
    setFlag(ALU);
} // Inst_VOP1__V_FFBL_B32

Inst_VOP1__V_FFBL_B32::~Inst_VOP1__V_FFBL_B32()
{
} // ~Inst_VOP1__V_FFBL_B32

// --- description from .arch file ---
// D.u = position of first 1 in S0.u from LSB;
// D.u = 0xffffffff if S0.u == 0.
// D.u = bit position of the first 1 in S0.u counting from the LSB
// (0xffffffff when S0.u == 0, per the description above).
void
Inst_VOP1__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = findFirstOne(src[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_FFBH_I32 class methods ---

Inst_VOP1__V_FFBH_I32::Inst_VOP1__V_FFBH_I32(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_ffbh_i32")
{
    setFlag(ALU);
} // Inst_VOP1__V_FFBH_I32

Inst_VOP1__V_FFBH_I32::~Inst_VOP1__V_FFBH_I32()
{
} // ~Inst_VOP1__V_FFBH_I32

// --- description from .arch file ---
// D.u = position of first bit different from sign bit in S0.i from MSB;
// D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff.
void
Inst_VOP1__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = firstOppositeSignBit(src[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_FREXP_EXP_I32_F64 class methods ---

Inst_VOP1__V_FREXP_EXP_I32_F64::Inst_VOP1__V_FREXP_EXP_I32_F64(
    InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64")
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP1__V_FREXP_EXP_I32_F64

Inst_VOP1__V_FREXP_EXP_I32_F64::~Inst_VOP1__V_FREXP_EXP_I32_F64()
{
} // ~Inst_VOP1__V_FREXP_EXP_I32_F64

// --- description from .arch file ---
// See V_FREXP_EXP_I32_F32.
// D.i = binary exponent of S0.d such that S0.d = significand * 2**exp
// (C library frexp convention); inf/NaN inputs yield 0.
void
Inst_VOP1__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
    VecOperandI32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            if (std::isinf(src[lane]) || std::isnan(src[lane])) {
                vdst[lane] = 0;
            } else {
                VecElemI32 exp = 0;
                // Only the exponent out-param is wanted; the returned
                // mantissa is discarded.
                std::frexp(src[lane], &exp);
                vdst[lane] = exp;
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_FREXP_MANT_F64 class methods ---

Inst_VOP1__V_FREXP_MANT_F64::Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_frexp_mant_f64")
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP1__V_FREXP_MANT_F64

Inst_VOP1__V_FREXP_MANT_F64::~Inst_VOP1__V_FREXP_MANT_F64()
{
} // ~Inst_VOP1__V_FREXP_MANT_F64

// --- description from .arch file ---
// See V_FREXP_MANT_F32.
void
Inst_VOP1__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // inf/NaN pass through unchanged; otherwise return the
            // frexp mantissa (range (-1.0,-0.5] or [0.5,1.0)).
            if (std::isinf(src[lane]) || std::isnan(src[lane])) {
                vdst[lane] = src[lane];
            } else {
                VecElemI32 exp(0);
                vdst[lane] = std::frexp(src[lane], &exp);
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_FRACT_F64 class methods ---

Inst_VOP1__V_FRACT_F64::Inst_VOP1__V_FRACT_F64(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_fract_f64")
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP1__V_FRACT_F64

Inst_VOP1__V_FRACT_F64::~Inst_VOP1__V_FRACT_F64()
{
} // ~Inst_VOP1__V_FRACT_F64

// --- description from .arch file ---
// See V_FRACT_F32.
// D.d = fractional part of S0.d, extracted with modf (the integer part
// is discarded). NOTE(review): modf's fraction keeps the input's sign,
// whereas the .arch definition of FRACT is S0 - floor(S0); the two differ
// for negative inputs -- confirm intended semantics.
void
Inst_VOP1__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            VecElemF64 int_part(0.0);
            vdst[lane] = std::modf(src[lane], &int_part);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_FREXP_EXP_I32_F32 class methods ---

Inst_VOP1__V_FREXP_EXP_I32_F32::Inst_VOP1__V_FREXP_EXP_I32_F32(
    InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32")
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP1__V_FREXP_EXP_I32_F32

Inst_VOP1__V_FREXP_EXP_I32_F32::~Inst_VOP1__V_FREXP_EXP_I32_F32()
{
} // ~Inst_VOP1__V_FREXP_EXP_I32_F32

// --- description from .arch file ---
// if(S0.f == INF || S0.f == NAN) then D.i = 0;
// else D.i = TwosComplement(Exponent(S0.f) - 127 + 1).
// Returns exponent of single precision float input, such that S0.f =
// significand * (2 ** exponent). See also FREXP_MANT_F32, which returns
// the significand.
// D.i = binary exponent of S0.f such that S0.f = significand * 2**exp
// (C library frexp convention); inf/NaN inputs yield 0.
void
Inst_VOP1__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
    VecOperandI32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            if (std::isinf(src[lane]) || std::isnan(src[lane])) {
                vdst[lane] = 0;
            } else {
                VecElemI32 exp(0);
                // Only the exponent out-param is wanted; the returned
                // mantissa is discarded.
                std::frexp(src[lane], &exp);
                vdst[lane] = exp;
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_FREXP_MANT_F32 class methods ---

Inst_VOP1__V_FREXP_MANT_F32::Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_frexp_mant_f32")
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP1__V_FREXP_MANT_F32

Inst_VOP1__V_FREXP_MANT_F32::~Inst_VOP1__V_FREXP_MANT_F32()
{
} // ~Inst_VOP1__V_FREXP_MANT_F32

// --- description from .arch file ---
// if(S0.f == INF || S0.f == NAN) then D.f = S0.f;
// else D.f = Mantissa(S0.f).
// Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary
// --- significand of single precision float input, such that S0.f =
// --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which
// --- returns integer exponent.
// D.f = frexp mantissa of S0.f (range (-1.0,-0.5] or [0.5,1.0));
// inf/NaN inputs pass through unchanged.
void
Inst_VOP1__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            if (std::isinf(src[lane]) || std::isnan(src[lane])) {
                vdst[lane] = src[lane];
            } else {
                VecElemI32 exp(0);
                // Exponent out-param is required by frexp but unused here.
                vdst[lane] = std::frexp(src[lane], &exp);
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_CLREXCP class methods ---

Inst_VOP1__V_CLREXCP::Inst_VOP1__V_CLREXCP(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_clrexcp")
{
    setFlag(ALU);
} // Inst_VOP1__V_CLREXCP

Inst_VOP1__V_CLREXCP::~Inst_VOP1__V_CLREXCP()
{
} // ~Inst_VOP1__V_CLREXCP

// --- description from .arch file ---
// Clear wave's exception state in SIMD (SP).
// Not implemented in this model; decoding to this instruction panics.
void
Inst_VOP1__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_CVT_F16_U16 class methods ---

Inst_VOP1__V_CVT_F16_U16::Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_cvt_f16_u16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_CVT_F16_U16

Inst_VOP1__V_CVT_F16_U16::~Inst_VOP1__V_CVT_F16_U16()
{
} // ~Inst_VOP1__V_CVT_F16_U16

// --- description from .arch file ---
// D.f16 = uint16_to_flt16(S.u16).
// Supports denormals, rounding, exception flags and saturation.
// Not implemented in this model; decoding to this instruction panics.
void
Inst_VOP1__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_CVT_F16_I16 class methods ---

Inst_VOP1__V_CVT_F16_I16::Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_cvt_f16_i16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_CVT_F16_I16

Inst_VOP1__V_CVT_F16_I16::~Inst_VOP1__V_CVT_F16_I16()
{
} // ~Inst_VOP1__V_CVT_F16_I16

// --- description from .arch file ---
// D.f16 = int16_to_flt16(S.i16).
// Supports denormals, rounding, exception flags and saturation.
// All F16 VOP1 instructions below are unimplemented in this model;
// executing any of them panics the simulation.
void
Inst_VOP1__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_CVT_U16_F16 class methods ---

Inst_VOP1__V_CVT_U16_F16::Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_cvt_u16_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_CVT_U16_F16

Inst_VOP1__V_CVT_U16_F16::~Inst_VOP1__V_CVT_U16_F16()
{
} // ~Inst_VOP1__V_CVT_U16_F16

// --- description from .arch file ---
// D.u16 = flt16_to_uint16(S.f16).
// Supports rounding, exception flags and saturation.
void
Inst_VOP1__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_CVT_I16_F16 class methods ---

Inst_VOP1__V_CVT_I16_F16::Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_cvt_i16_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_CVT_I16_F16

Inst_VOP1__V_CVT_I16_F16::~Inst_VOP1__V_CVT_I16_F16()
{
} // ~Inst_VOP1__V_CVT_I16_F16

// --- description from .arch file ---
// D.i16 = flt16_to_int16(S.f16).
// Supports rounding, exception flags and saturation.
void
Inst_VOP1__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_RCP_F16 class methods ---

Inst_VOP1__V_RCP_F16::Inst_VOP1__V_RCP_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_rcp_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_RCP_F16

Inst_VOP1__V_RCP_F16::~Inst_VOP1__V_RCP_F16()
{
} // ~Inst_VOP1__V_RCP_F16

// --- description from .arch file ---
// if(S0.f16 == 1.0f)
//     D.f16 = 1.0f;
// else
//     D.f16 = ApproximateRecip(S0.f16).
void
Inst_VOP1__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_SQRT_F16 class methods ---

Inst_VOP1__V_SQRT_F16::Inst_VOP1__V_SQRT_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_sqrt_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_SQRT_F16

Inst_VOP1__V_SQRT_F16::~Inst_VOP1__V_SQRT_F16()
{
} // ~Inst_VOP1__V_SQRT_F16

// --- description from .arch file ---
// if(S0.f16 == 1.0f)
//     D.f16 = 1.0f;
// else
//     D.f16 = ApproximateSqrt(S0.f16).
void
Inst_VOP1__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_RSQ_F16 class methods ---

Inst_VOP1__V_RSQ_F16::Inst_VOP1__V_RSQ_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_rsq_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_RSQ_F16

Inst_VOP1__V_RSQ_F16::~Inst_VOP1__V_RSQ_F16()
{
} // ~Inst_VOP1__V_RSQ_F16

// --- description from .arch file ---
// if(S0.f16 == 1.0f)
//     D.f16 = 1.0f;
// else
//     D.f16 = ApproximateRecipSqrt(S0.f16).
void
Inst_VOP1__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_LOG_F16 class methods ---

Inst_VOP1__V_LOG_F16::Inst_VOP1__V_LOG_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_log_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_LOG_F16

Inst_VOP1__V_LOG_F16::~Inst_VOP1__V_LOG_F16()
{
} // ~Inst_VOP1__V_LOG_F16

// --- description from .arch file ---
// if(S0.f16 == 1.0f)
//     D.f16 = 0.0f;
// else
//     D.f16 = ApproximateLog2(S0.f16).
void
Inst_VOP1__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_EXP_F16 class methods ---

Inst_VOP1__V_EXP_F16::Inst_VOP1__V_EXP_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_exp_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_EXP_F16

Inst_VOP1__V_EXP_F16::~Inst_VOP1__V_EXP_F16()
{
} // ~Inst_VOP1__V_EXP_F16

// --- description from .arch file ---
// if(S0.f16 == 0.0f)
//     D.f16 = 1.0f;
// else
//     D.f16 = Approximate2ToX(S0.f16).
void
Inst_VOP1__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_FREXP_MANT_F16 class methods ---

Inst_VOP1__V_FREXP_MANT_F16::Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_frexp_mant_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_FREXP_MANT_F16

Inst_VOP1__V_FREXP_MANT_F16::~Inst_VOP1__V_FREXP_MANT_F16()
{
} // ~Inst_VOP1__V_FREXP_MANT_F16

// --- description from .arch file ---
// if(S0.f16 == +-INF || S0.f16 == NAN)
//     D.f16 = S0.f16;
// else
//     D.f16 = mantissa(S0.f16).
// Result range is (-1.0,-0.5][0.5,1.0).
// C math library frexp function.
// Returns binary significand of half precision float input, such that the
// original single float = significand * (2 ** exponent).
void
Inst_VOP1__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_FREXP_EXP_I16_F16 class methods ---

Inst_VOP1__V_FREXP_EXP_I16_F16::Inst_VOP1__V_FREXP_EXP_I16_F16(
    InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_FREXP_EXP_I16_F16

Inst_VOP1__V_FREXP_EXP_I16_F16::~Inst_VOP1__V_FREXP_EXP_I16_F16()
{
} // ~Inst_VOP1__V_FREXP_EXP_I16_F16

// --- description from .arch file ---
// if(S0.f16 == +-INF || S0.f16 == NAN)
//     D.i16 = 0;
// else
//     D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1).
// C math library frexp function.
// Returns exponent of half precision float input, such that the
// original single float = significand * (2 ** exponent).
void
Inst_VOP1__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_FLOOR_F16 class methods ---

Inst_VOP1__V_FLOOR_F16::Inst_VOP1__V_FLOOR_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_floor_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_FLOOR_F16

Inst_VOP1__V_FLOOR_F16::~Inst_VOP1__V_FLOOR_F16()
{
} // ~Inst_VOP1__V_FLOOR_F16

// --- description from .arch file ---
// D.f16 = trunc(S0.f16);
// if(S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f.
void
Inst_VOP1__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_CEIL_F16 class methods ---

Inst_VOP1__V_CEIL_F16::Inst_VOP1__V_CEIL_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_ceil_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_CEIL_F16

Inst_VOP1__V_CEIL_F16::~Inst_VOP1__V_CEIL_F16()
{
} // ~Inst_VOP1__V_CEIL_F16

// --- description from .arch file ---
// D.f16 = trunc(S0.f16);
// if(S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f.
void
Inst_VOP1__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_TRUNC_F16 class methods ---

Inst_VOP1__V_TRUNC_F16::Inst_VOP1__V_TRUNC_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_trunc_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_TRUNC_F16

Inst_VOP1__V_TRUNC_F16::~Inst_VOP1__V_TRUNC_F16()
{
} // ~Inst_VOP1__V_TRUNC_F16

// --- description from .arch file ---
// D.f16 = trunc(S0.f16).
// Round-to-zero semantics.
void
Inst_VOP1__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_RNDNE_F16 class methods ---

Inst_VOP1__V_RNDNE_F16::Inst_VOP1__V_RNDNE_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_rndne_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_RNDNE_F16

Inst_VOP1__V_RNDNE_F16::~Inst_VOP1__V_RNDNE_F16()
{
} // ~Inst_VOP1__V_RNDNE_F16

// --- description from .arch file ---
// D.f16 = FLOOR(S0.f16 + 0.5f);
// if(floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f.
// Round-to-nearest-even semantics.
void
Inst_VOP1__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_FRACT_F16 class methods ---

Inst_VOP1__V_FRACT_F16::Inst_VOP1__V_FRACT_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_fract_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_FRACT_F16

Inst_VOP1__V_FRACT_F16::~Inst_VOP1__V_FRACT_F16()
{
} // ~Inst_VOP1__V_FRACT_F16

// --- description from .arch file ---
// D.f16 = S0.f16 + -floor(S0.f16).
void
Inst_VOP1__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_SIN_F16 class methods ---

Inst_VOP1__V_SIN_F16::Inst_VOP1__V_SIN_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_sin_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_SIN_F16

Inst_VOP1__V_SIN_F16::~Inst_VOP1__V_SIN_F16()
{
} // ~Inst_VOP1__V_SIN_F16

// --- description from .arch file ---
// D.f16 = sin(S0.f16 * 2 * PI).
void
Inst_VOP1__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_COS_F16 class methods ---

Inst_VOP1__V_COS_F16::Inst_VOP1__V_COS_F16(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_cos_f16")
{
    setFlag(ALU);
    setFlag(F16);
} // Inst_VOP1__V_COS_F16

Inst_VOP1__V_COS_F16::~Inst_VOP1__V_COS_F16()
{
} // ~Inst_VOP1__V_COS_F16

// --- description from .arch file ---
// D.f16 = cos(S0.f16 * 2 * PI).
void
Inst_VOP1__V_COS_F16::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP1__V_EXP_LEGACY_F32 class methods ---

Inst_VOP1__V_EXP_LEGACY_F32::Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_exp_legacy_f32")
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP1__V_EXP_LEGACY_F32

Inst_VOP1__V_EXP_LEGACY_F32::~Inst_VOP1__V_EXP_LEGACY_F32()
{
} // ~Inst_VOP1__V_EXP_LEGACY_F32

// --- description from .arch file ---
// D.f = pow(2.0, S0.f) with legacy semantics.
// D.f = 2**S0.f, per active lane.
// NOTE(review): the "legacy semantics" of the ISA description (special
// handling of certain edge values) are not modeled -- this is plain pow.
void
Inst_VOP1__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::pow(2.0, src[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_LOG_LEGACY_F32 class methods ---

Inst_VOP1__V_LOG_LEGACY_F32::Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1 *iFmt)
    : Inst_VOP1(iFmt, "v_log_legacy_f32")
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP1__V_LOG_LEGACY_F32

Inst_VOP1__V_LOG_LEGACY_F32::~Inst_VOP1__V_LOG_LEGACY_F32()
{
} // ~Inst_VOP1__V_LOG_LEGACY_F32

// --- description from .arch file ---
// D.f = log2(S0.f). Base 2 logarithm with legacy semantics.
void
Inst_VOP1__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::log2(src[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOPC__V_CMP_CLASS_F32 class methods ---

Inst_VOPC__V_CMP_CLASS_F32::Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC *iFmt)
    : Inst_VOPC(iFmt, "v_cmp_class_f32")
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOPC__V_CMP_CLASS_F32

Inst_VOPC__V_CMP_CLASS_F32::~Inst_VOPC__V_CMP_CLASS_F32()
{
} // ~Inst_VOPC__V_CMP_CLASS_F32

// --- description from .arch file ---
// VCC = IEEE numeric class function specified in S1.u, performed on S0.f
// The function reports true if the floating point value is *any* of the
// --- numeric types selected in S1.u according to the following list:
// S1.u[0] -- value is a signaling NaN.
// S1.u[1] -- value is a quiet NaN.
// S1.u[2] -- value is negative infinity.
// S1.u[3] -- value is a negative normal value.
// S1.u[4] -- value is a negative denormal value.
// S1.u[5] -- value is negative zero.
// S1.u[6] -- value is positive zero.
// S1.u[7] -- value is a positive denormal value.
// S1.u[8] -- value is a positive normal value.
// S1.u[9] -- value is positive infinity.
//
// Each active lane sets its VCC bit when S0 belongs to any class whose
// select bit is set in S1; the `continue` after each match skips the
// remaining (mutually exclusive) class tests for that lane.
// NOTE(review): lanes that match no selected class never write their VCC
// bit (no setBit(lane, 0)), and vcc is declared as a *Const* scalar
// operand yet is written via setBit/write -- both look suspicious;
// confirm against the operand API and intended VCC-clearing semantics.
void
Inst_VOPC__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
    ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

    src0.readSrc();
    src1.read();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Signaling vs. quiet NaN are not distinguished here; either
            // select bit matches any NaN.
            if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
                // is NaN
                if (std::isnan(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 2)) {
                // is -infinity
                if (std::isinf(src0[lane]) && std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 3)) {
                // is -normal
                if (std::isnormal(src0[lane])
                    && std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 4)) {
                // is -denormal
                if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                    && std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 5)) {
                // is -zero
                if (std::fpclassify(src0[lane]) == FP_ZERO
                    && std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 6)) {
                // is +zero
                if (std::fpclassify(src0[lane]) == FP_ZERO
                    && !std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 7)) {
                // is +denormal
                if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                    && !std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 8)) {
                // is +normal
                if (std::isnormal(src0[lane])
                    && !std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 9)) {
                // is +infinity
                if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
        }
    }

    vcc.write();
} // execute
// --- Inst_VOPC__V_CMPX_CLASS_F32 class methods ---

Inst_VOPC__V_CMPX_CLASS_F32::Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC *iFmt)
    : Inst_VOPC(iFmt, "v_cmpx_class_f32")
{
    setFlag(ALU);
    setFlag(F32);
    // CMPX variants also update the EXEC mask from the comparison result.
    setFlag(WritesEXEC);
} // Inst_VOPC__V_CMPX_CLASS_F32

Inst_VOPC__V_CMPX_CLASS_F32::~Inst_VOPC__V_CMPX_CLASS_F32()
{
} // ~Inst_VOPC__V_CMPX_CLASS_F32

// --- description from .arch file ---
// EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
// S0.f The function reports true if the floating point value is *any* of
// the numeric types selected in S1.u according to the following list:
// S1.u[0] -- value is a signaling NaN.
// S1.u[1] -- value is a quiet NaN.
// S1.u[2] -- value is negative infinity.
// S1.u[3] -- value is a negative normal value.
// S1.u[4] -- value is a negative denormal value.
// S1.u[5] -- value is negative zero.
// S1.u[6] -- value is positive zero.
// S1.u[7] -- value is a positive denormal value.
// S1.u[8] -- value is a positive normal value.
// S1.u[9] -- value is positive infinity.
// Same numeric-class test as V_CMP_CLASS_F32, but the resulting VCC mask
// is additionally copied into the wave's EXEC mask at the end.
// NOTE(review): lanes that match no selected class never write their VCC
// bit (no setBit(lane, 0)), and vcc is declared as a *Const* scalar
// operand yet is written via setBit/write -- confirm against the operand
// API and intended VCC-clearing semantics.
void
Inst_VOPC__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
    ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

    src0.readSrc();
    src1.read();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Signaling vs. quiet NaN are not distinguished here.
            if (bits(src1[lane], 0) || bits(src1[lane], 1)) {
                // is NaN
                if (std::isnan(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 2)) {
                // is -infinity
                if (std::isinf(src0[lane]) && std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 3)) {
                // is -normal
                if (std::isnormal(src0[lane])
                    && std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 4)) {
                // is -denormal
                if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                    && std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 5)) {
                // is -zero
                if (std::fpclassify(src0[lane]) == FP_ZERO
                    && std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 6)) {
                // is +zero
                if (std::fpclassify(src0[lane]) == FP_ZERO
                    && !std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 7)) {
                // is +denormal
                if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
                    && !std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 8)) {
                // is +normal
                if (std::isnormal(src0[lane])
                    && !std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
            if (bits(src1[lane], 9)) {
                // is +infinity
                if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) {
                    vcc.setBit(lane, 1);
                    continue;
                }
            }
        }
    }

    vcc.write();
    // CMPX: the comparison result also becomes the new EXEC mask.
    wf->execMask() = vcc.rawData();
} // execute
// --- Inst_VOPC__V_CMP_CLASS_F64 class methods ---

Inst_VOPC__V_CMP_CLASS_F64::Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_CLASS_F64 - - Inst_VOPC__V_CMP_CLASS_F64::~Inst_VOPC__V_CMP_CLASS_F64() - { - } // ~Inst_VOPC__V_CMP_CLASS_F64 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.d - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_CLASS_F64 class methods --- - - Inst_VOPC__V_CMPX_CLASS_F64::Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC 
*iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_CLASS_F64 - - Inst_VOPC__V_CMPX_CLASS_F64::~Inst_VOPC__V_CMPX_CLASS_F64() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F64 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.d The function reports true if the floating point value is *any* of - // the numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOPC__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if 
(bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - vcc.setBit(lane, 1); - continue; - } - } - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMP_CLASS_F16 class methods --- - - Inst_VOPC__V_CMP_CLASS_F16::Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_class_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_CLASS_F16 - - Inst_VOPC__V_CMP_CLASS_F16::~Inst_VOPC__V_CMP_CLASS_F16() - { - } // ~Inst_VOPC__V_CMP_CLASS_F16 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOPC__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_CLASS_F16 class methods --- - - Inst_VOPC__V_CMPX_CLASS_F16::Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_class_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_CLASS_F16 - - Inst_VOPC__V_CMPX_CLASS_F16::~Inst_VOPC__V_CMPX_CLASS_F16() - { - } // ~Inst_VOPC__V_CMPX_CLASS_F16 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // --- S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOPC__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_F_F16 class methods --- - - Inst_VOPC__V_CMP_F_F16::Inst_VOPC__V_CMP_F_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_F_F16 - - Inst_VOPC__V_CMP_F_F16::~Inst_VOPC__V_CMP_F_F16() - { - } // ~Inst_VOPC__V_CMP_F_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_LT_F16 class methods --- - - Inst_VOPC__V_CMP_LT_F16::Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LT_F16 - - Inst_VOPC__V_CMP_LT_F16::~Inst_VOPC__V_CMP_LT_F16() - { - } // ~Inst_VOPC__V_CMP_LT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_F16 class methods --- - - Inst_VOPC__V_CMP_EQ_F16::Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_EQ_F16 - - Inst_VOPC__V_CMP_EQ_F16::~Inst_VOPC__V_CMP_EQ_F16() - { - } // ~Inst_VOPC__V_CMP_EQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_LE_F16 class methods --- - - Inst_VOPC__V_CMP_LE_F16::Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LE_F16 - - Inst_VOPC__V_CMP_LE_F16::~Inst_VOPC__V_CMP_LE_F16() - { - } // ~Inst_VOPC__V_CMP_LE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_GT_F16 class methods --- - - Inst_VOPC__V_CMP_GT_F16::Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_GT_F16 - - Inst_VOPC__V_CMP_GT_F16::~Inst_VOPC__V_CMP_GT_F16() - { - } // ~Inst_VOPC__V_CMP_GT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_LG_F16 class methods --- - - Inst_VOPC__V_CMP_LG_F16::Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_LG_F16 - - Inst_VOPC__V_CMP_LG_F16::~Inst_VOPC__V_CMP_LG_F16() - { - } // ~Inst_VOPC__V_CMP_LG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_GE_F16 class methods --- - - Inst_VOPC__V_CMP_GE_F16::Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_GE_F16 - - Inst_VOPC__V_CMP_GE_F16::~Inst_VOPC__V_CMP_GE_F16() - { - } // ~Inst_VOPC__V_CMP_GE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_O_F16 class methods --- - - Inst_VOPC__V_CMP_O_F16::Inst_VOPC__V_CMP_O_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_O_F16 - - Inst_VOPC__V_CMP_O_F16::~Inst_VOPC__V_CMP_O_F16() - { - } // ~Inst_VOPC__V_CMP_O_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_U_F16 class methods --- - - Inst_VOPC__V_CMP_U_F16::Inst_VOPC__V_CMP_U_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_U_F16 - - Inst_VOPC__V_CMP_U_F16::~Inst_VOPC__V_CMP_U_F16() - { - } // ~Inst_VOPC__V_CMP_U_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NGE_F16 class methods --- - - Inst_VOPC__V_CMP_NGE_F16::Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NGE_F16 - - Inst_VOPC__V_CMP_NGE_F16::~Inst_VOPC__V_CMP_NGE_F16() - { - } // ~Inst_VOPC__V_CMP_NGE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NLG_F16 class methods --- - - Inst_VOPC__V_CMP_NLG_F16::Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLG_F16 - - Inst_VOPC__V_CMP_NLG_F16::~Inst_VOPC__V_CMP_NLG_F16() - { - } // ~Inst_VOPC__V_CMP_NLG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NGT_F16 class methods --- - - Inst_VOPC__V_CMP_NGT_F16::Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NGT_F16 - - Inst_VOPC__V_CMP_NGT_F16::~Inst_VOPC__V_CMP_NGT_F16() - { - } // ~Inst_VOPC__V_CMP_NGT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NLE_F16 class methods --- - - Inst_VOPC__V_CMP_NLE_F16::Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLE_F16 - - Inst_VOPC__V_CMP_NLE_F16::~Inst_VOPC__V_CMP_NLE_F16() - { - } // ~Inst_VOPC__V_CMP_NLE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NEQ_F16 class methods --- - - Inst_VOPC__V_CMP_NEQ_F16::Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NEQ_F16 - - Inst_VOPC__V_CMP_NEQ_F16::~Inst_VOPC__V_CMP_NEQ_F16() - { - } // ~Inst_VOPC__V_CMP_NEQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_NLT_F16 class methods --- - - Inst_VOPC__V_CMP_NLT_F16::Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_NLT_F16 - - Inst_VOPC__V_CMP_NLT_F16::~Inst_VOPC__V_CMP_NLT_F16() - { - } // ~Inst_VOPC__V_CMP_NLT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_TRU_F16 class methods --- - - Inst_VOPC__V_CMP_TRU_F16::Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f16") - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOPC__V_CMP_TRU_F16 - - Inst_VOPC__V_CMP_TRU_F16::~Inst_VOPC__V_CMP_TRU_F16() - { - } // ~Inst_VOPC__V_CMP_TRU_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_F_F16 class methods --- - - Inst_VOPC__V_CMPX_F_F16::Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_F16 - - Inst_VOPC__V_CMPX_F_F16::~Inst_VOPC__V_CMPX_F_F16() - { - } // ~Inst_VOPC__V_CMPX_F_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_F16 class methods --- - - Inst_VOPC__V_CMPX_LT_F16::Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_F16 - - Inst_VOPC__V_CMPX_LT_F16::~Inst_VOPC__V_CMPX_LT_F16() - { - } // ~Inst_VOPC__V_CMPX_LT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_F16 class methods --- - - Inst_VOPC__V_CMPX_EQ_F16::Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_F16 - - Inst_VOPC__V_CMPX_EQ_F16::~Inst_VOPC__V_CMPX_EQ_F16() - { - } // ~Inst_VOPC__V_CMPX_EQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_F16 class methods --- - - Inst_VOPC__V_CMPX_LE_F16::Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_F16 - - Inst_VOPC__V_CMPX_LE_F16::~Inst_VOPC__V_CMPX_LE_F16() - { - } // ~Inst_VOPC__V_CMPX_LE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_F16 class methods --- - - Inst_VOPC__V_CMPX_GT_F16::Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_F16 - - Inst_VOPC__V_CMPX_GT_F16::~Inst_VOPC__V_CMPX_GT_F16() - { - } // ~Inst_VOPC__V_CMPX_GT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_LG_F16 class methods --- - - Inst_VOPC__V_CMPX_LG_F16::Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LG_F16 - - Inst_VOPC__V_CMPX_LG_F16::~Inst_VOPC__V_CMPX_LG_F16() - { - } // ~Inst_VOPC__V_CMPX_LG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_F16 class methods --- - - Inst_VOPC__V_CMPX_GE_F16::Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_F16 - - Inst_VOPC__V_CMPX_GE_F16::~Inst_VOPC__V_CMPX_GE_F16() - { - } // ~Inst_VOPC__V_CMPX_GE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_O_F16 class methods --- - - Inst_VOPC__V_CMPX_O_F16::Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_O_F16 - - Inst_VOPC__V_CMPX_O_F16::~Inst_VOPC__V_CMPX_O_F16() - { - } // ~Inst_VOPC__V_CMPX_O_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_U_F16 class methods --- - - Inst_VOPC__V_CMPX_U_F16::Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_U_F16 - - Inst_VOPC__V_CMPX_U_F16::~Inst_VOPC__V_CMPX_U_F16() - { - } // ~Inst_VOPC__V_CMPX_U_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOPC__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NGE_F16 class methods --- - - Inst_VOPC__V_CMPX_NGE_F16::Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGE_F16 - - Inst_VOPC__V_CMPX_NGE_F16::~Inst_VOPC__V_CMPX_NGE_F16() - { - } // ~Inst_VOPC__V_CMPX_NGE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NLG_F16 class methods --- - - Inst_VOPC__V_CMPX_NLG_F16::Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLG_F16 - - Inst_VOPC__V_CMPX_NLG_F16::~Inst_VOPC__V_CMPX_NLG_F16() - { - } // ~Inst_VOPC__V_CMPX_NLG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NGT_F16 class methods --- - - Inst_VOPC__V_CMPX_NGT_F16::Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGT_F16 - - Inst_VOPC__V_CMPX_NGT_F16::~Inst_VOPC__V_CMPX_NGT_F16() - { - } // ~Inst_VOPC__V_CMPX_NGT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NLE_F16 class methods --- - - Inst_VOPC__V_CMPX_NLE_F16::Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLE_F16 - - Inst_VOPC__V_CMPX_NLE_F16::~Inst_VOPC__V_CMPX_NLE_F16() - { - } // ~Inst_VOPC__V_CMPX_NLE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NEQ_F16 class methods --- - - Inst_VOPC__V_CMPX_NEQ_F16::Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NEQ_F16 - - Inst_VOPC__V_CMPX_NEQ_F16::~Inst_VOPC__V_CMPX_NEQ_F16() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_NLT_F16 class methods --- - - Inst_VOPC__V_CMPX_NLT_F16::Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLT_F16 - - Inst_VOPC__V_CMPX_NLT_F16::~Inst_VOPC__V_CMPX_NLT_F16() - { - } // ~Inst_VOPC__V_CMPX_NLT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMPX_TRU_F16 class methods --- - - Inst_VOPC__V_CMPX_TRU_F16::Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f16") - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_TRU_F16 - - Inst_VOPC__V_CMPX_TRU_F16::~Inst_VOPC__V_CMPX_TRU_F16() - { - } // ~Inst_VOPC__V_CMPX_TRU_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOPC__V_CMP_F_F32 class methods --- - - Inst_VOPC__V_CMP_F_F32::Inst_VOPC__V_CMP_F_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_F_F32 - - Inst_VOPC__V_CMP_F_F32::~Inst_VOPC__V_CMP_F_F32() - { - } // ~Inst_VOPC__V_CMP_F_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_F32 class methods --- - - Inst_VOPC__V_CMP_LT_F32::Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LT_F32 - - Inst_VOPC__V_CMP_LT_F32::~Inst_VOPC__V_CMP_LT_F32() - { - } // ~Inst_VOPC__V_CMP_LT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_F32 class methods --- - - Inst_VOPC__V_CMP_EQ_F32::Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_EQ_F32 - - Inst_VOPC__V_CMP_EQ_F32::~Inst_VOPC__V_CMP_EQ_F32() - { - } // ~Inst_VOPC__V_CMP_EQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_F32 class methods --- - - Inst_VOPC__V_CMP_LE_F32::Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LE_F32 - - Inst_VOPC__V_CMP_LE_F32::~Inst_VOPC__V_CMP_LE_F32() - { - } // ~Inst_VOPC__V_CMP_LE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_F32 class methods --- - - Inst_VOPC__V_CMP_GT_F32::Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_GT_F32 - - Inst_VOPC__V_CMP_GT_F32::~Inst_VOPC__V_CMP_GT_F32() - { - } // ~Inst_VOPC__V_CMP_GT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LG_F32 class methods --- - - Inst_VOPC__V_CMP_LG_F32::Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_LG_F32 - - Inst_VOPC__V_CMP_LG_F32::~Inst_VOPC__V_CMP_LG_F32() - { - } // ~Inst_VOPC__V_CMP_LG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_F32 class methods --- - - Inst_VOPC__V_CMP_GE_F32::Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_GE_F32 - - Inst_VOPC__V_CMP_GE_F32::~Inst_VOPC__V_CMP_GE_F32() - { - } // ~Inst_VOPC__V_CMP_GE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_O_F32 class methods --- - - Inst_VOPC__V_CMP_O_F32::Inst_VOPC__V_CMP_O_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_O_F32 - - Inst_VOPC__V_CMP_O_F32::~Inst_VOPC__V_CMP_O_F32() - { - } // ~Inst_VOPC__V_CMP_O_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_U_F32 class methods --- - - Inst_VOPC__V_CMP_U_F32::Inst_VOPC__V_CMP_U_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_U_F32 - - Inst_VOPC__V_CMP_U_F32::~Inst_VOPC__V_CMP_U_F32() - { - } // ~Inst_VOPC__V_CMP_U_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGE_F32 class methods --- - - Inst_VOPC__V_CMP_NGE_F32::Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NGE_F32 - - Inst_VOPC__V_CMP_NGE_F32::~Inst_VOPC__V_CMP_NGE_F32() - { - } // ~Inst_VOPC__V_CMP_NGE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLG_F32 class methods --- - - Inst_VOPC__V_CMP_NLG_F32::Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLG_F32 - - Inst_VOPC__V_CMP_NLG_F32::~Inst_VOPC__V_CMP_NLG_F32() - { - } // ~Inst_VOPC__V_CMP_NLG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGT_F32 class methods --- - - Inst_VOPC__V_CMP_NGT_F32::Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NGT_F32 - - Inst_VOPC__V_CMP_NGT_F32::~Inst_VOPC__V_CMP_NGT_F32() - { - } // ~Inst_VOPC__V_CMP_NGT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLE_F32 class methods --- - - Inst_VOPC__V_CMP_NLE_F32::Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLE_F32 - - Inst_VOPC__V_CMP_NLE_F32::~Inst_VOPC__V_CMP_NLE_F32() - { - } // ~Inst_VOPC__V_CMP_NLE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NEQ_F32 class methods --- - - Inst_VOPC__V_CMP_NEQ_F32::Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NEQ_F32 - - Inst_VOPC__V_CMP_NEQ_F32::~Inst_VOPC__V_CMP_NEQ_F32() - { - } // ~Inst_VOPC__V_CMP_NEQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLT_F32 class methods --- - - Inst_VOPC__V_CMP_NLT_F32::Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_NLT_F32 - - Inst_VOPC__V_CMP_NLT_F32::~Inst_VOPC__V_CMP_NLT_F32() - { - } // ~Inst_VOPC__V_CMP_NLT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_TRU_F32 class methods --- - - Inst_VOPC__V_CMP_TRU_F32::Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOPC__V_CMP_TRU_F32 - - Inst_VOPC__V_CMP_TRU_F32::~Inst_VOPC__V_CMP_TRU_F32() - { - } // ~Inst_VOPC__V_CMP_TRU_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_F32 class methods --- - - Inst_VOPC__V_CMPX_F_F32::Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_F32 - - Inst_VOPC__V_CMPX_F_F32::~Inst_VOPC__V_CMPX_F_F32() - { - } // ~Inst_VOPC__V_CMPX_F_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_F32 class methods --- - - Inst_VOPC__V_CMPX_LT_F32::Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_F32 - - Inst_VOPC__V_CMPX_LT_F32::~Inst_VOPC__V_CMPX_LT_F32() - { - } // ~Inst_VOPC__V_CMPX_LT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_F32 class methods --- - - Inst_VOPC__V_CMPX_EQ_F32::Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_F32 - - Inst_VOPC__V_CMPX_EQ_F32::~Inst_VOPC__V_CMPX_EQ_F32() - { - } // ~Inst_VOPC__V_CMPX_EQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_F32 class methods --- - - Inst_VOPC__V_CMPX_LE_F32::Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_F32 - - Inst_VOPC__V_CMPX_LE_F32::~Inst_VOPC__V_CMPX_LE_F32() - { - } // ~Inst_VOPC__V_CMPX_LE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_F32 class methods --- - - Inst_VOPC__V_CMPX_GT_F32::Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_F32 - - Inst_VOPC__V_CMPX_GT_F32::~Inst_VOPC__V_CMPX_GT_F32() - { - } // ~Inst_VOPC__V_CMPX_GT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LG_F32 class methods --- - - Inst_VOPC__V_CMPX_LG_F32::Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LG_F32 - - Inst_VOPC__V_CMPX_LG_F32::~Inst_VOPC__V_CMPX_LG_F32() - { - } // ~Inst_VOPC__V_CMPX_LG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_F32 class methods --- - - Inst_VOPC__V_CMPX_GE_F32::Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_F32 - - Inst_VOPC__V_CMPX_GE_F32::~Inst_VOPC__V_CMPX_GE_F32() - { - } // ~Inst_VOPC__V_CMPX_GE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_O_F32 class methods --- - - Inst_VOPC__V_CMPX_O_F32::Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_O_F32 - - Inst_VOPC__V_CMPX_O_F32::~Inst_VOPC__V_CMPX_O_F32() - { - } // ~Inst_VOPC__V_CMPX_O_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_U_F32 class methods --- - - Inst_VOPC__V_CMPX_U_F32::Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_U_F32 - - Inst_VOPC__V_CMPX_U_F32::~Inst_VOPC__V_CMPX_U_F32() - { - } // ~Inst_VOPC__V_CMPX_U_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOPC__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NGE_F32 class methods --- - - Inst_VOPC__V_CMPX_NGE_F32::Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGE_F32 - - Inst_VOPC__V_CMPX_NGE_F32::~Inst_VOPC__V_CMPX_NGE_F32() - { - } // ~Inst_VOPC__V_CMPX_NGE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NLG_F32 class methods --- - - Inst_VOPC__V_CMPX_NLG_F32::Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLG_F32 - - Inst_VOPC__V_CMPX_NLG_F32::~Inst_VOPC__V_CMPX_NLG_F32() - { - } // ~Inst_VOPC__V_CMPX_NLG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NGT_F32 class methods --- - - Inst_VOPC__V_CMPX_NGT_F32::Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGT_F32 - - Inst_VOPC__V_CMPX_NGT_F32::~Inst_VOPC__V_CMPX_NGT_F32() - { - } // ~Inst_VOPC__V_CMPX_NGT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NLE_F32 class methods --- - - Inst_VOPC__V_CMPX_NLE_F32::Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLE_F32 - - Inst_VOPC__V_CMPX_NLE_F32::~Inst_VOPC__V_CMPX_NLE_F32() - { - } // ~Inst_VOPC__V_CMPX_NLE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NEQ_F32 class methods --- - - Inst_VOPC__V_CMPX_NEQ_F32::Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NEQ_F32 - - Inst_VOPC__V_CMPX_NEQ_F32::~Inst_VOPC__V_CMPX_NEQ_F32() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_NLT_F32 class methods --- - - Inst_VOPC__V_CMPX_NLT_F32::Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLT_F32 - - Inst_VOPC__V_CMPX_NLT_F32::~Inst_VOPC__V_CMPX_NLT_F32() - { - } // ~Inst_VOPC__V_CMPX_NLT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_TRU_F32 class methods --- - - Inst_VOPC__V_CMPX_TRU_F32::Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f32") - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_TRU_F32 - - Inst_VOPC__V_CMPX_TRU_F32::~Inst_VOPC__V_CMPX_TRU_F32() - { - } // ~Inst_VOPC__V_CMPX_TRU_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMP_F_F64 class methods --- - - Inst_VOPC__V_CMP_F_F64::Inst_VOPC__V_CMP_F_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_F_F64 - - Inst_VOPC__V_CMP_F_F64::~Inst_VOPC__V_CMP_F_F64() - { - } // ~Inst_VOPC__V_CMP_F_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_F64 class methods --- - - Inst_VOPC__V_CMP_LT_F64::Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LT_F64 - - Inst_VOPC__V_CMP_LT_F64::~Inst_VOPC__V_CMP_LT_F64() - { - } // ~Inst_VOPC__V_CMP_LT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_F64 class methods --- - - Inst_VOPC__V_CMP_EQ_F64::Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_EQ_F64 - - Inst_VOPC__V_CMP_EQ_F64::~Inst_VOPC__V_CMP_EQ_F64() - { - } // ~Inst_VOPC__V_CMP_EQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_F64 class methods --- - - Inst_VOPC__V_CMP_LE_F64::Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LE_F64 - - Inst_VOPC__V_CMP_LE_F64::~Inst_VOPC__V_CMP_LE_F64() - { - } // ~Inst_VOPC__V_CMP_LE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_F64 class methods --- - - Inst_VOPC__V_CMP_GT_F64::Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_GT_F64 - - Inst_VOPC__V_CMP_GT_F64::~Inst_VOPC__V_CMP_GT_F64() - { - } // ~Inst_VOPC__V_CMP_GT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LG_F64 class methods --- - - Inst_VOPC__V_CMP_LG_F64::Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_LG_F64 - - Inst_VOPC__V_CMP_LG_F64::~Inst_VOPC__V_CMP_LG_F64() - { - } // ~Inst_VOPC__V_CMP_LG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_F64 class methods --- - - Inst_VOPC__V_CMP_GE_F64::Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_GE_F64 - - Inst_VOPC__V_CMP_GE_F64::~Inst_VOPC__V_CMP_GE_F64() - { - } // ~Inst_VOPC__V_CMP_GE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_O_F64 class methods --- - - Inst_VOPC__V_CMP_O_F64::Inst_VOPC__V_CMP_O_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_o_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_O_F64 - - Inst_VOPC__V_CMP_O_F64::~Inst_VOPC__V_CMP_O_F64() - { - } // ~Inst_VOPC__V_CMP_O_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_U_F64 class methods --- - - Inst_VOPC__V_CMP_U_F64::Inst_VOPC__V_CMP_U_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_u_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_U_F64 - - Inst_VOPC__V_CMP_U_F64::~Inst_VOPC__V_CMP_U_F64() - { - } // ~Inst_VOPC__V_CMP_U_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGE_F64 class methods --- - - Inst_VOPC__V_CMP_NGE_F64::Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nge_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NGE_F64 - - Inst_VOPC__V_CMP_NGE_F64::~Inst_VOPC__V_CMP_NGE_F64() - { - } // ~Inst_VOPC__V_CMP_NGE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLG_F64 class methods --- - - Inst_VOPC__V_CMP_NLG_F64::Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlg_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLG_F64 - - Inst_VOPC__V_CMP_NLG_F64::~Inst_VOPC__V_CMP_NLG_F64() - { - } // ~Inst_VOPC__V_CMP_NLG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NGT_F64 class methods --- - - Inst_VOPC__V_CMP_NGT_F64::Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ngt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NGT_F64 - - Inst_VOPC__V_CMP_NGT_F64::~Inst_VOPC__V_CMP_NGT_F64() - { - } // ~Inst_VOPC__V_CMP_NGT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLE_F64 class methods --- - - Inst_VOPC__V_CMP_NLE_F64::Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nle_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLE_F64 - - Inst_VOPC__V_CMP_NLE_F64::~Inst_VOPC__V_CMP_NLE_F64() - { - } // ~Inst_VOPC__V_CMP_NLE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NEQ_F64 class methods --- - - Inst_VOPC__V_CMP_NEQ_F64::Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_neq_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NEQ_F64 - - Inst_VOPC__V_CMP_NEQ_F64::~Inst_VOPC__V_CMP_NEQ_F64() - { - } // ~Inst_VOPC__V_CMP_NEQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NLT_F64 class methods --- - - Inst_VOPC__V_CMP_NLT_F64::Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_nlt_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_NLT_F64 - - Inst_VOPC__V_CMP_NLT_F64::~Inst_VOPC__V_CMP_NLT_F64() - { - } // ~Inst_VOPC__V_CMP_NLT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_TRU_F64 class methods --- - - Inst_VOPC__V_CMP_TRU_F64::Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_tru_f64") - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOPC__V_CMP_TRU_F64 - - Inst_VOPC__V_CMP_TRU_F64::~Inst_VOPC__V_CMP_TRU_F64() - { - } // ~Inst_VOPC__V_CMP_TRU_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_F64 class methods --- - - Inst_VOPC__V_CMPX_F_F64::Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_F64 - - Inst_VOPC__V_CMPX_F_F64::~Inst_VOPC__V_CMPX_F_F64() - { - } // ~Inst_VOPC__V_CMPX_F_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_F64 class methods --- - - Inst_VOPC__V_CMPX_LT_F64::Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_F64 - - Inst_VOPC__V_CMPX_LT_F64::~Inst_VOPC__V_CMPX_LT_F64() - { - } // ~Inst_VOPC__V_CMPX_LT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_F64 class methods --- - - Inst_VOPC__V_CMPX_EQ_F64::Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_F64 - - Inst_VOPC__V_CMPX_EQ_F64::~Inst_VOPC__V_CMPX_EQ_F64() - { - } // ~Inst_VOPC__V_CMPX_EQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - wf->execMask() = vcc.rawData(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_F64 class methods --- - - Inst_VOPC__V_CMPX_LE_F64::Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_F64 - - Inst_VOPC__V_CMPX_LE_F64::~Inst_VOPC__V_CMPX_LE_F64() - { - } // ~Inst_VOPC__V_CMPX_LE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_F64 class methods --- - - Inst_VOPC__V_CMPX_GT_F64::Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_F64 - - Inst_VOPC__V_CMPX_GT_F64::~Inst_VOPC__V_CMPX_GT_F64() - { - } // ~Inst_VOPC__V_CMPX_GT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LG_F64 class methods --- - - Inst_VOPC__V_CMPX_LG_F64::Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lg_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LG_F64 - - Inst_VOPC__V_CMPX_LG_F64::~Inst_VOPC__V_CMPX_LG_F64() - { - } // ~Inst_VOPC__V_CMPX_LG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_F64 class methods --- - - Inst_VOPC__V_CMPX_GE_F64::Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_F64 - - Inst_VOPC__V_CMPX_GE_F64::~Inst_VOPC__V_CMPX_GE_F64() - { - } // ~Inst_VOPC__V_CMPX_GE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_O_F64 class methods --- - - Inst_VOPC__V_CMPX_O_F64::Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_o_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_O_F64 - - Inst_VOPC__V_CMPX_O_F64::~Inst_VOPC__V_CMPX_O_F64() - { - } // ~Inst_VOPC__V_CMPX_O_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOPC__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_U_F64 class methods --- - - Inst_VOPC__V_CMPX_U_F64::Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_u_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_U_F64 - - Inst_VOPC__V_CMPX_U_F64::~Inst_VOPC__V_CMPX_U_F64() - { - } // ~Inst_VOPC__V_CMPX_U_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOPC__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NGE_F64 class methods --- - - Inst_VOPC__V_CMPX_NGE_F64::Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nge_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGE_F64 - - Inst_VOPC__V_CMPX_NGE_F64::~Inst_VOPC__V_CMPX_NGE_F64() - { - } // ~Inst_VOPC__V_CMPX_NGE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NLG_F64 class methods --- - - Inst_VOPC__V_CMPX_NLG_F64::Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlg_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLG_F64 - - Inst_VOPC__V_CMPX_NLG_F64::~Inst_VOPC__V_CMPX_NLG_F64() - { - } // ~Inst_VOPC__V_CMPX_NLG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NGT_F64 class methods --- - - Inst_VOPC__V_CMPX_NGT_F64::Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ngt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NGT_F64 - - Inst_VOPC__V_CMPX_NGT_F64::~Inst_VOPC__V_CMPX_NGT_F64() - { - } // ~Inst_VOPC__V_CMPX_NGT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NLE_F64 class methods --- - - Inst_VOPC__V_CMPX_NLE_F64::Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nle_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLE_F64 - - Inst_VOPC__V_CMPX_NLE_F64::~Inst_VOPC__V_CMPX_NLE_F64() - { - } // ~Inst_VOPC__V_CMPX_NLE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NEQ_F64 class methods --- - - Inst_VOPC__V_CMPX_NEQ_F64::Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_neq_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NEQ_F64 - - Inst_VOPC__V_CMPX_NEQ_F64::~Inst_VOPC__V_CMPX_NEQ_F64() - { - } // ~Inst_VOPC__V_CMPX_NEQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NLT_F64 class methods --- - - Inst_VOPC__V_CMPX_NLT_F64::Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_nlt_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NLT_F64 - - Inst_VOPC__V_CMPX_NLT_F64::~Inst_VOPC__V_CMPX_NLT_F64() - { - } // ~Inst_VOPC__V_CMPX_NLT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_TRU_F64 class methods --- - - Inst_VOPC__V_CMPX_TRU_F64::Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_tru_f64") - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_TRU_F64 - - Inst_VOPC__V_CMPX_TRU_F64::~Inst_VOPC__V_CMPX_TRU_F64() - { - } // ~Inst_VOPC__V_CMPX_TRU_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_I16 class methods --- - - Inst_VOPC__V_CMP_F_I16::Inst_VOPC__V_CMP_F_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I16 - - Inst_VOPC__V_CMP_F_I16::~Inst_VOPC__V_CMP_F_I16() - { - } // ~Inst_VOPC__V_CMP_F_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_I16 class methods --- - - Inst_VOPC__V_CMP_LT_I16::Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I16 - - Inst_VOPC__V_CMP_LT_I16::~Inst_VOPC__V_CMP_LT_I16() - { - } // ~Inst_VOPC__V_CMP_LT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_I16 class methods --- - - Inst_VOPC__V_CMP_EQ_I16::Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I16 - - Inst_VOPC__V_CMP_EQ_I16::~Inst_VOPC__V_CMP_EQ_I16() - { - } // ~Inst_VOPC__V_CMP_EQ_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_I16 class methods --- - - Inst_VOPC__V_CMP_LE_I16::Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I16 - - Inst_VOPC__V_CMP_LE_I16::~Inst_VOPC__V_CMP_LE_I16() - { - } // ~Inst_VOPC__V_CMP_LE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_I16 class methods --- - - Inst_VOPC__V_CMP_GT_I16::Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I16 - - Inst_VOPC__V_CMP_GT_I16::~Inst_VOPC__V_CMP_GT_I16() - { - } // ~Inst_VOPC__V_CMP_GT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_I16 class methods --- - - Inst_VOPC__V_CMP_NE_I16::Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I16 - - Inst_VOPC__V_CMP_NE_I16::~Inst_VOPC__V_CMP_NE_I16() - { - } // ~Inst_VOPC__V_CMP_NE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_I16 class methods --- - - Inst_VOPC__V_CMP_GE_I16::Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I16 - - Inst_VOPC__V_CMP_GE_I16::~Inst_VOPC__V_CMP_GE_I16() - { - } // ~Inst_VOPC__V_CMP_GE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_I16 class methods --- - - Inst_VOPC__V_CMP_T_I16::Inst_VOPC__V_CMP_T_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I16 - - Inst_VOPC__V_CMP_T_I16::~Inst_VOPC__V_CMP_T_I16() - { - } // ~Inst_VOPC__V_CMP_T_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_U16 class methods --- - - Inst_VOPC__V_CMP_F_U16::Inst_VOPC__V_CMP_F_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U16 - - Inst_VOPC__V_CMP_F_U16::~Inst_VOPC__V_CMP_F_U16() - { - } // ~Inst_VOPC__V_CMP_F_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_U16 class methods --- - - Inst_VOPC__V_CMP_LT_U16::Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U16 - - Inst_VOPC__V_CMP_LT_U16::~Inst_VOPC__V_CMP_LT_U16() - { - } // ~Inst_VOPC__V_CMP_LT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_U16 class methods --- - - Inst_VOPC__V_CMP_EQ_U16::Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U16 - - Inst_VOPC__V_CMP_EQ_U16::~Inst_VOPC__V_CMP_EQ_U16() - { - } // ~Inst_VOPC__V_CMP_EQ_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_U16 class methods --- - - Inst_VOPC__V_CMP_LE_U16::Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U16 - - Inst_VOPC__V_CMP_LE_U16::~Inst_VOPC__V_CMP_LE_U16() - { - } // ~Inst_VOPC__V_CMP_LE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_U16 class methods --- - - Inst_VOPC__V_CMP_GT_U16::Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U16 - - Inst_VOPC__V_CMP_GT_U16::~Inst_VOPC__V_CMP_GT_U16() - { - } // ~Inst_VOPC__V_CMP_GT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_U16 class methods --- - - Inst_VOPC__V_CMP_NE_U16::Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U16 - - Inst_VOPC__V_CMP_NE_U16::~Inst_VOPC__V_CMP_NE_U16() - { - } // ~Inst_VOPC__V_CMP_NE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_U16 class methods --- - - Inst_VOPC__V_CMP_GE_U16::Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U16 - - Inst_VOPC__V_CMP_GE_U16::~Inst_VOPC__V_CMP_GE_U16() - { - } // ~Inst_VOPC__V_CMP_GE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_U16 class methods --- - - Inst_VOPC__V_CMP_T_U16::Inst_VOPC__V_CMP_T_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u16") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U16 - - Inst_VOPC__V_CMP_T_U16::~Inst_VOPC__V_CMP_T_U16() - { - } // ~Inst_VOPC__V_CMP_T_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_I16 class methods --- - - Inst_VOPC__V_CMPX_F_I16::Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_I16 - - Inst_VOPC__V_CMPX_F_I16::~Inst_VOPC__V_CMPX_F_I16() - { - } // ~Inst_VOPC__V_CMPX_F_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_I16 class methods --- - - Inst_VOPC__V_CMPX_LT_I16::Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_I16 - - Inst_VOPC__V_CMPX_LT_I16::~Inst_VOPC__V_CMPX_LT_I16() - { - } // ~Inst_VOPC__V_CMPX_LT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_I16 class methods --- - - Inst_VOPC__V_CMPX_EQ_I16::Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_I16 - - Inst_VOPC__V_CMPX_EQ_I16::~Inst_VOPC__V_CMPX_EQ_I16() - { - } // ~Inst_VOPC__V_CMPX_EQ_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_I16 class methods --- - - Inst_VOPC__V_CMPX_LE_I16::Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_I16 - - Inst_VOPC__V_CMPX_LE_I16::~Inst_VOPC__V_CMPX_LE_I16() - { - } // ~Inst_VOPC__V_CMPX_LE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_I16 class methods --- - - Inst_VOPC__V_CMPX_GT_I16::Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_I16 - - Inst_VOPC__V_CMPX_GT_I16::~Inst_VOPC__V_CMPX_GT_I16() - { - } // ~Inst_VOPC__V_CMPX_GT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_I16 class methods --- - - Inst_VOPC__V_CMPX_NE_I16::Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_I16 - - Inst_VOPC__V_CMPX_NE_I16::~Inst_VOPC__V_CMPX_NE_I16() - { - } // ~Inst_VOPC__V_CMPX_NE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_I16 class methods --- - - Inst_VOPC__V_CMPX_GE_I16::Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_I16 - - Inst_VOPC__V_CMPX_GE_I16::~Inst_VOPC__V_CMPX_GE_I16() - { - } // ~Inst_VOPC__V_CMPX_GE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_I16 class methods --- - - Inst_VOPC__V_CMPX_T_I16::Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_I16 - - Inst_VOPC__V_CMPX_T_I16::~Inst_VOPC__V_CMPX_T_I16() - { - } // ~Inst_VOPC__V_CMPX_T_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_U16 class methods --- - - Inst_VOPC__V_CMPX_F_U16::Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_U16 - - Inst_VOPC__V_CMPX_F_U16::~Inst_VOPC__V_CMPX_F_U16() - { - } // ~Inst_VOPC__V_CMPX_F_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_U16 class methods --- - - Inst_VOPC__V_CMPX_LT_U16::Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_U16 - - Inst_VOPC__V_CMPX_LT_U16::~Inst_VOPC__V_CMPX_LT_U16() - { - } // ~Inst_VOPC__V_CMPX_LT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_U16 class methods --- - - Inst_VOPC__V_CMPX_EQ_U16::Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_U16 - - Inst_VOPC__V_CMPX_EQ_U16::~Inst_VOPC__V_CMPX_EQ_U16() - { - } // ~Inst_VOPC__V_CMPX_EQ_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_U16 class methods --- - - Inst_VOPC__V_CMPX_LE_U16::Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_U16 - - Inst_VOPC__V_CMPX_LE_U16::~Inst_VOPC__V_CMPX_LE_U16() - { - } // ~Inst_VOPC__V_CMPX_LE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_U16 class methods --- - - Inst_VOPC__V_CMPX_GT_U16::Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_U16 - - Inst_VOPC__V_CMPX_GT_U16::~Inst_VOPC__V_CMPX_GT_U16() - { - } // ~Inst_VOPC__V_CMPX_GT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_U16 class methods --- - - Inst_VOPC__V_CMPX_NE_U16::Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_U16 - - Inst_VOPC__V_CMPX_NE_U16::~Inst_VOPC__V_CMPX_NE_U16() - { - } // ~Inst_VOPC__V_CMPX_NE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_U16 class methods --- - - Inst_VOPC__V_CMPX_GE_U16::Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_U16 - - Inst_VOPC__V_CMPX_GE_U16::~Inst_VOPC__V_CMPX_GE_U16() - { - } // ~Inst_VOPC__V_CMPX_GE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_U16 class methods --- - - Inst_VOPC__V_CMPX_T_U16::Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u16") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_U16 - - Inst_VOPC__V_CMPX_T_U16::~Inst_VOPC__V_CMPX_T_U16() - { - } // ~Inst_VOPC__V_CMPX_T_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_I32 class methods --- - - Inst_VOPC__V_CMP_F_I32::Inst_VOPC__V_CMP_F_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I32 - - Inst_VOPC__V_CMP_F_I32::~Inst_VOPC__V_CMP_F_I32() - { - } // ~Inst_VOPC__V_CMP_F_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_I32 class methods --- - - Inst_VOPC__V_CMP_LT_I32::Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I32 - - Inst_VOPC__V_CMP_LT_I32::~Inst_VOPC__V_CMP_LT_I32() - { - } // ~Inst_VOPC__V_CMP_LT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_I32 class methods --- - - Inst_VOPC__V_CMP_EQ_I32::Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I32 - - Inst_VOPC__V_CMP_EQ_I32::~Inst_VOPC__V_CMP_EQ_I32() - { - } // ~Inst_VOPC__V_CMP_EQ_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_I32 class methods --- - - Inst_VOPC__V_CMP_LE_I32::Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I32 - - Inst_VOPC__V_CMP_LE_I32::~Inst_VOPC__V_CMP_LE_I32() - { - } // ~Inst_VOPC__V_CMP_LE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_I32 class methods --- - - Inst_VOPC__V_CMP_GT_I32::Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I32 - - Inst_VOPC__V_CMP_GT_I32::~Inst_VOPC__V_CMP_GT_I32() - { - } // ~Inst_VOPC__V_CMP_GT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_I32 class methods --- - - Inst_VOPC__V_CMP_NE_I32::Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I32 - - Inst_VOPC__V_CMP_NE_I32::~Inst_VOPC__V_CMP_NE_I32() - { - } // ~Inst_VOPC__V_CMP_NE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_I32 class methods --- - - Inst_VOPC__V_CMP_GE_I32::Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I32 - - Inst_VOPC__V_CMP_GE_I32::~Inst_VOPC__V_CMP_GE_I32() - { - } // ~Inst_VOPC__V_CMP_GE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_I32 class methods --- - - Inst_VOPC__V_CMP_T_I32::Inst_VOPC__V_CMP_T_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I32 - - Inst_VOPC__V_CMP_T_I32::~Inst_VOPC__V_CMP_T_I32() - { - } // ~Inst_VOPC__V_CMP_T_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_U32 class methods --- - - Inst_VOPC__V_CMP_F_U32::Inst_VOPC__V_CMP_F_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U32 - - Inst_VOPC__V_CMP_F_U32::~Inst_VOPC__V_CMP_F_U32() - { - } // ~Inst_VOPC__V_CMP_F_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_U32 class methods --- - - Inst_VOPC__V_CMP_LT_U32::Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U32 - - Inst_VOPC__V_CMP_LT_U32::~Inst_VOPC__V_CMP_LT_U32() - { - } // ~Inst_VOPC__V_CMP_LT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_U32 class methods --- - - Inst_VOPC__V_CMP_EQ_U32::Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U32 - - Inst_VOPC__V_CMP_EQ_U32::~Inst_VOPC__V_CMP_EQ_U32() - { - } // ~Inst_VOPC__V_CMP_EQ_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_U32 class methods --- - - Inst_VOPC__V_CMP_LE_U32::Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U32 - - Inst_VOPC__V_CMP_LE_U32::~Inst_VOPC__V_CMP_LE_U32() - { - } // ~Inst_VOPC__V_CMP_LE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_U32 class methods --- - - Inst_VOPC__V_CMP_GT_U32::Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U32 - - Inst_VOPC__V_CMP_GT_U32::~Inst_VOPC__V_CMP_GT_U32() - { - } // ~Inst_VOPC__V_CMP_GT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_U32 class methods --- - - Inst_VOPC__V_CMP_NE_U32::Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U32 - - Inst_VOPC__V_CMP_NE_U32::~Inst_VOPC__V_CMP_NE_U32() - { - } // ~Inst_VOPC__V_CMP_NE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_U32 class methods --- - - Inst_VOPC__V_CMP_GE_U32::Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U32 - - Inst_VOPC__V_CMP_GE_U32::~Inst_VOPC__V_CMP_GE_U32() - { - } // ~Inst_VOPC__V_CMP_GE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_U32 class methods --- - - Inst_VOPC__V_CMP_T_U32::Inst_VOPC__V_CMP_T_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u32") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U32 - - Inst_VOPC__V_CMP_T_U32::~Inst_VOPC__V_CMP_T_U32() - { - } // ~Inst_VOPC__V_CMP_T_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_I32 class methods --- - - Inst_VOPC__V_CMPX_F_I32::Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_I32 - - Inst_VOPC__V_CMPX_F_I32::~Inst_VOPC__V_CMPX_F_I32() - { - } // ~Inst_VOPC__V_CMPX_F_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_I32 class methods --- - - Inst_VOPC__V_CMPX_LT_I32::Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_I32 - - Inst_VOPC__V_CMPX_LT_I32::~Inst_VOPC__V_CMPX_LT_I32() - { - } // ~Inst_VOPC__V_CMPX_LT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_I32 class methods --- - - Inst_VOPC__V_CMPX_EQ_I32::Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_I32 - - Inst_VOPC__V_CMPX_EQ_I32::~Inst_VOPC__V_CMPX_EQ_I32() - { - } // ~Inst_VOPC__V_CMPX_EQ_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_I32 class methods --- - - Inst_VOPC__V_CMPX_LE_I32::Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_I32 - - Inst_VOPC__V_CMPX_LE_I32::~Inst_VOPC__V_CMPX_LE_I32() - { - } // ~Inst_VOPC__V_CMPX_LE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_I32 class methods --- - - Inst_VOPC__V_CMPX_GT_I32::Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_I32 - - Inst_VOPC__V_CMPX_GT_I32::~Inst_VOPC__V_CMPX_GT_I32() - { - } // ~Inst_VOPC__V_CMPX_GT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_I32 class methods --- - - Inst_VOPC__V_CMPX_NE_I32::Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_I32 - - Inst_VOPC__V_CMPX_NE_I32::~Inst_VOPC__V_CMPX_NE_I32() - { - } // ~Inst_VOPC__V_CMPX_NE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_I32 class methods --- - - Inst_VOPC__V_CMPX_GE_I32::Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_I32 - - Inst_VOPC__V_CMPX_GE_I32::~Inst_VOPC__V_CMPX_GE_I32() - { - } // ~Inst_VOPC__V_CMPX_GE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_I32 class methods --- - - Inst_VOPC__V_CMPX_T_I32::Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_I32 - - Inst_VOPC__V_CMPX_T_I32::~Inst_VOPC__V_CMPX_T_I32() - { - } // ~Inst_VOPC__V_CMPX_T_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_U32 class methods --- - - Inst_VOPC__V_CMPX_F_U32::Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_U32 - - Inst_VOPC__V_CMPX_F_U32::~Inst_VOPC__V_CMPX_F_U32() - { - } // ~Inst_VOPC__V_CMPX_F_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_U32 class methods --- - - Inst_VOPC__V_CMPX_LT_U32::Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_U32 - - Inst_VOPC__V_CMPX_LT_U32::~Inst_VOPC__V_CMPX_LT_U32() - { - } // ~Inst_VOPC__V_CMPX_LT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_U32 class methods --- - - Inst_VOPC__V_CMPX_EQ_U32::Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_U32 - - Inst_VOPC__V_CMPX_EQ_U32::~Inst_VOPC__V_CMPX_EQ_U32() - { - } // ~Inst_VOPC__V_CMPX_EQ_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_U32 class methods --- - - Inst_VOPC__V_CMPX_LE_U32::Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_U32 - - Inst_VOPC__V_CMPX_LE_U32::~Inst_VOPC__V_CMPX_LE_U32() - { - } // ~Inst_VOPC__V_CMPX_LE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_U32 class methods --- - - Inst_VOPC__V_CMPX_GT_U32::Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_U32 - - Inst_VOPC__V_CMPX_GT_U32::~Inst_VOPC__V_CMPX_GT_U32() - { - } // ~Inst_VOPC__V_CMPX_GT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_U32 class methods --- - - Inst_VOPC__V_CMPX_NE_U32::Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_U32 - - Inst_VOPC__V_CMPX_NE_U32::~Inst_VOPC__V_CMPX_NE_U32() - { - } // ~Inst_VOPC__V_CMPX_NE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_U32 class methods --- - - Inst_VOPC__V_CMPX_GE_U32::Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_U32 - - Inst_VOPC__V_CMPX_GE_U32::~Inst_VOPC__V_CMPX_GE_U32() - { - } // ~Inst_VOPC__V_CMPX_GE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_U32 class methods --- - - Inst_VOPC__V_CMPX_T_U32::Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u32") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_U32 - - Inst_VOPC__V_CMPX_T_U32::~Inst_VOPC__V_CMPX_T_U32() - { - } // ~Inst_VOPC__V_CMPX_T_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_I64 class methods --- - - Inst_VOPC__V_CMP_F_I64::Inst_VOPC__V_CMP_F_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_I64 - - Inst_VOPC__V_CMP_F_I64::~Inst_VOPC__V_CMP_F_I64() - { - } // ~Inst_VOPC__V_CMP_F_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_I64 class methods --- - - Inst_VOPC__V_CMP_LT_I64::Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_I64 - - Inst_VOPC__V_CMP_LT_I64::~Inst_VOPC__V_CMP_LT_I64() - { - } // ~Inst_VOPC__V_CMP_LT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_I64 class methods --- - - Inst_VOPC__V_CMP_EQ_I64::Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_I64 - - Inst_VOPC__V_CMP_EQ_I64::~Inst_VOPC__V_CMP_EQ_I64() - { - } // ~Inst_VOPC__V_CMP_EQ_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_I64 class methods --- - - Inst_VOPC__V_CMP_LE_I64::Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_I64 - - Inst_VOPC__V_CMP_LE_I64::~Inst_VOPC__V_CMP_LE_I64() - { - } // ~Inst_VOPC__V_CMP_LE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_I64 class methods --- - - Inst_VOPC__V_CMP_GT_I64::Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_I64 - - Inst_VOPC__V_CMP_GT_I64::~Inst_VOPC__V_CMP_GT_I64() - { - } // ~Inst_VOPC__V_CMP_GT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_I64 class methods --- - - Inst_VOPC__V_CMP_NE_I64::Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_I64 - - Inst_VOPC__V_CMP_NE_I64::~Inst_VOPC__V_CMP_NE_I64() - { - } // ~Inst_VOPC__V_CMP_NE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_I64 class methods --- - - Inst_VOPC__V_CMP_GE_I64::Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_I64 - - Inst_VOPC__V_CMP_GE_I64::~Inst_VOPC__V_CMP_GE_I64() - { - } // ~Inst_VOPC__V_CMP_GE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_I64 class methods --- - - Inst_VOPC__V_CMP_T_I64::Inst_VOPC__V_CMP_T_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_i64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_I64 - - Inst_VOPC__V_CMP_T_I64::~Inst_VOPC__V_CMP_T_I64() - { - } // ~Inst_VOPC__V_CMP_T_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_F_U64 class methods --- - - Inst_VOPC__V_CMP_F_U64::Inst_VOPC__V_CMP_F_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_f_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_F_U64 - - Inst_VOPC__V_CMP_F_U64::~Inst_VOPC__V_CMP_F_U64() - { - } // ~Inst_VOPC__V_CMP_F_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LT_U64 class methods --- - - Inst_VOPC__V_CMP_LT_U64::Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_lt_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LT_U64 - - Inst_VOPC__V_CMP_LT_U64::~Inst_VOPC__V_CMP_LT_U64() - { - } // ~Inst_VOPC__V_CMP_LT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_EQ_U64 class methods --- - - Inst_VOPC__V_CMP_EQ_U64::Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_eq_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_EQ_U64 - - Inst_VOPC__V_CMP_EQ_U64::~Inst_VOPC__V_CMP_EQ_U64() - { - } // ~Inst_VOPC__V_CMP_EQ_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_LE_U64 class methods --- - - Inst_VOPC__V_CMP_LE_U64::Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_le_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_LE_U64 - - Inst_VOPC__V_CMP_LE_U64::~Inst_VOPC__V_CMP_LE_U64() - { - } // ~Inst_VOPC__V_CMP_LE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GT_U64 class methods --- - - Inst_VOPC__V_CMP_GT_U64::Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_gt_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GT_U64 - - Inst_VOPC__V_CMP_GT_U64::~Inst_VOPC__V_CMP_GT_U64() - { - } // ~Inst_VOPC__V_CMP_GT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_NE_U64 class methods --- - - Inst_VOPC__V_CMP_NE_U64::Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ne_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_NE_U64 - - Inst_VOPC__V_CMP_NE_U64::~Inst_VOPC__V_CMP_NE_U64() - { - } // ~Inst_VOPC__V_CMP_NE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_GE_U64 class methods --- - - Inst_VOPC__V_CMP_GE_U64::Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_ge_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_GE_U64 - - Inst_VOPC__V_CMP_GE_U64::~Inst_VOPC__V_CMP_GE_U64() - { - } // ~Inst_VOPC__V_CMP_GE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMP_T_U64 class methods --- - - Inst_VOPC__V_CMP_T_U64::Inst_VOPC__V_CMP_T_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmp_t_u64") - { - setFlag(ALU); - } // Inst_VOPC__V_CMP_T_U64 - - Inst_VOPC__V_CMP_T_U64::~Inst_VOPC__V_CMP_T_U64() - { - } // ~Inst_VOPC__V_CMP_T_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_I64 class methods --- - - Inst_VOPC__V_CMPX_F_I64::Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_I64 - - Inst_VOPC__V_CMPX_F_I64::~Inst_VOPC__V_CMPX_F_I64() - { - } // ~Inst_VOPC__V_CMPX_F_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_I64 class methods --- - - Inst_VOPC__V_CMPX_LT_I64::Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_I64 - - Inst_VOPC__V_CMPX_LT_I64::~Inst_VOPC__V_CMPX_LT_I64() - { - } // ~Inst_VOPC__V_CMPX_LT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_I64 class methods --- - - Inst_VOPC__V_CMPX_EQ_I64::Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_I64 - - Inst_VOPC__V_CMPX_EQ_I64::~Inst_VOPC__V_CMPX_EQ_I64() - { - } // ~Inst_VOPC__V_CMPX_EQ_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_I64 class methods --- - - Inst_VOPC__V_CMPX_LE_I64::Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_I64 - - Inst_VOPC__V_CMPX_LE_I64::~Inst_VOPC__V_CMPX_LE_I64() - { - } // ~Inst_VOPC__V_CMPX_LE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_I64 class methods --- - - Inst_VOPC__V_CMPX_GT_I64::Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_I64 - - Inst_VOPC__V_CMPX_GT_I64::~Inst_VOPC__V_CMPX_GT_I64() - { - } // ~Inst_VOPC__V_CMPX_GT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_I64 class methods --- - - Inst_VOPC__V_CMPX_NE_I64::Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_I64 - - Inst_VOPC__V_CMPX_NE_I64::~Inst_VOPC__V_CMPX_NE_I64() - { - } // ~Inst_VOPC__V_CMPX_NE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_I64 class methods --- - - Inst_VOPC__V_CMPX_GE_I64::Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_I64 - - Inst_VOPC__V_CMPX_GE_I64::~Inst_VOPC__V_CMPX_GE_I64() - { - } // ~Inst_VOPC__V_CMPX_GE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_I64 class methods --- - - Inst_VOPC__V_CMPX_T_I64::Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_i64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_I64 - - Inst_VOPC__V_CMPX_T_I64::~Inst_VOPC__V_CMPX_T_I64() - { - } // ~Inst_VOPC__V_CMPX_T_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_F_U64 class methods --- - - Inst_VOPC__V_CMPX_F_U64::Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_f_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_F_U64 - - Inst_VOPC__V_CMPX_F_U64::~Inst_VOPC__V_CMPX_F_U64() - { - } // ~Inst_VOPC__V_CMPX_F_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LT_U64 class methods --- - - Inst_VOPC__V_CMPX_LT_U64::Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_lt_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LT_U64 - - Inst_VOPC__V_CMPX_LT_U64::~Inst_VOPC__V_CMPX_LT_U64() - { - } // ~Inst_VOPC__V_CMPX_LT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_EQ_U64 class methods --- - - Inst_VOPC__V_CMPX_EQ_U64::Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_eq_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_EQ_U64 - - Inst_VOPC__V_CMPX_EQ_U64::~Inst_VOPC__V_CMPX_EQ_U64() - { - } // ~Inst_VOPC__V_CMPX_EQ_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_LE_U64 class methods --- - - Inst_VOPC__V_CMPX_LE_U64::Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_le_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_LE_U64 - - Inst_VOPC__V_CMPX_LE_U64::~Inst_VOPC__V_CMPX_LE_U64() - { - } // ~Inst_VOPC__V_CMPX_LE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GT_U64 class methods --- - - Inst_VOPC__V_CMPX_GT_U64::Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_gt_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GT_U64 - - Inst_VOPC__V_CMPX_GT_U64::~Inst_VOPC__V_CMPX_GT_U64() - { - } // ~Inst_VOPC__V_CMPX_GT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_NE_U64 class methods --- - - Inst_VOPC__V_CMPX_NE_U64::Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ne_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_NE_U64 - - Inst_VOPC__V_CMPX_NE_U64::~Inst_VOPC__V_CMPX_NE_U64() - { - } // ~Inst_VOPC__V_CMPX_NE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_GE_U64 class methods --- - - Inst_VOPC__V_CMPX_GE_U64::Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_ge_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_GE_U64 - - Inst_VOPC__V_CMPX_GE_U64::~Inst_VOPC__V_CMPX_GE_U64() - { - } // ~Inst_VOPC__V_CMPX_GE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOPC__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VOPC__V_CMPX_T_U64 class methods --- - - Inst_VOPC__V_CMPX_T_U64::Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC *iFmt) - : Inst_VOPC(iFmt, "v_cmpx_t_u64") - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOPC__V_CMPX_T_U64 - - Inst_VOPC__V_CMPX_T_U64::~Inst_VOPC__V_CMPX_T_U64() - { - } // ~Inst_VOPC__V_CMPX_T_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOPC__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, 1); - } - } - - wf->execMask() = vcc.rawData(); - vcc.write(); - } // execute - // --- Inst_VINTRP__V_INTERP_P1_F32 class methods --- - - Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32( - InFmt_VINTRP *iFmt) - : Inst_VINTRP(iFmt, "v_interp_p1_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VINTRP__V_INTERP_P1_F32 - - Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32() - { - } // ~Inst_VINTRP__V_INTERP_P1_F32 - - // --- description from .arch file --- - // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). - // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; - // if D == S then data corruption will occur. - // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VINTRP__V_INTERP_P2_F32 class methods --- - - Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32( - InFmt_VINTRP *iFmt) - : Inst_VINTRP(iFmt, "v_interp_p2_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VINTRP__V_INTERP_P2_F32 - - Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32() - { - } // ~Inst_VINTRP__V_INTERP_P2_F32 - - // --- description from .arch file --- - // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to - // V_MAD_F32 for SP). 
- // NOTE: In textual representations the I/J VGPR is the first source and - // the attribute is the second source; however in the VOP3 encoding the - // attribute is stored in the src0 field and the VGPR is stored in the - // src1 field. - void - Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VINTRP__V_INTERP_MOV_F32 class methods --- - - Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32( - InFmt_VINTRP *iFmt) - : Inst_VINTRP(iFmt, "v_interp_mov_f32") - { - setFlag(ALU); - setFlag(F32); - } // Inst_VINTRP__V_INTERP_MOV_F32 - - Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32() - { - } // ~Inst_VINTRP__V_INTERP_MOV_F32 - - // --- description from .arch file --- - // D.f = {P10,P20,P0}[S.u]; parameter load. - void - Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_CLASS_F32 class methods --- - - Inst_VOP3__V_CMP_CLASS_F32::Inst_VOP3__V_CMP_CLASS_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_class_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_CLASS_F32 - - Inst_VOP3__V_CMP_CLASS_F32::~Inst_VOP3__V_CMP_CLASS_F32() - { - } // ~Inst_VOP3__V_CMP_CLASS_F32 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. 
- // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_CLASS_F32 class methods --- - - 
Inst_VOP3__V_CMPX_CLASS_F32::Inst_VOP3__V_CMPX_CLASS_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_class_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_CLASS_F32 - - Inst_VOP3__V_CMPX_CLASS_F32::~Inst_VOP3__V_CMPX_CLASS_F32() - { - } // ~Inst_VOP3__V_CMPX_CLASS_F32 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.f - // The function reports true if the floating point value is *any* of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOP3__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_CLASS_F64 class methods --- - - 
Inst_VOP3__V_CMP_CLASS_F64::Inst_VOP3__V_CMP_CLASS_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_class_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_CLASS_F64 - - Inst_VOP3__V_CMP_CLASS_F64::~Inst_VOP3__V_CMP_CLASS_F64() - { - } // ~Inst_VOP3__V_CMP_CLASS_F64 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.d - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOP3__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_CLASS_F64 class methods --- - - Inst_VOP3__V_CMPX_CLASS_F64::Inst_VOP3__V_CMPX_CLASS_F64( - 
InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_class_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_CLASS_F64 - - Inst_VOP3__V_CMPX_CLASS_F64::~Inst_VOP3__V_CMPX_CLASS_F64() - { - } // ~Inst_VOP3__V_CMPX_CLASS_F64 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // S0.d - // The function reports true if the floating point value is *any* of the - // numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (bits(src1[lane], 0) || bits(src1[lane], 1)) { - // is NaN - if (std::isnan(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 2)) { - // is -infinity - if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 3)) { - // is -normal - if (std::isnormal(src0[lane]) - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 4)) { - // is -denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - 
continue; - } - } - if (bits(src1[lane], 5)) { - // is -zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 6)) { - // is +zero - if (std::fpclassify(src0[lane]) == FP_ZERO - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 7)) { - // is +denormal - if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 8)) { - // is +normal - if (std::isnormal(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - if (bits(src1[lane], 9)) { - // is +infinity - if (std::isinf(src0[lane]) - && !std::signbit(src0[lane])) { - sdst.setBit(lane, 1); - continue; - } - } - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_CLASS_F16 class methods --- - - Inst_VOP3__V_CMP_CLASS_F16::Inst_VOP3__V_CMP_CLASS_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_class_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_CLASS_F16 - - Inst_VOP3__V_CMP_CLASS_F16::~Inst_VOP3__V_CMP_CLASS_F16() - { - } // ~Inst_VOP3__V_CMP_CLASS_F16 - - // --- description from .arch file --- - // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. 
- void - Inst_VOP3__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_CLASS_F16 class methods --- - - Inst_VOP3__V_CMPX_CLASS_F16::Inst_VOP3__V_CMPX_CLASS_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_class_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_CLASS_F16 - - Inst_VOP3__V_CMPX_CLASS_F16::~Inst_VOP3__V_CMPX_CLASS_F16() - { - } // ~Inst_VOP3__V_CMPX_CLASS_F16 - - // --- description from .arch file --- - // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on - // --- S0.f16 - // The function reports true if the floating point value is *any* of the - // --- numeric types selected in S1.u according to the following list: - // S1.u[0] -- value is a signaling NaN. - // S1.u[1] -- value is a quiet NaN. - // S1.u[2] -- value is negative infinity. - // S1.u[3] -- value is a negative normal value. - // S1.u[4] -- value is a negative denormal value. - // S1.u[5] -- value is negative zero. - // S1.u[6] -- value is positive zero. - // S1.u[7] -- value is a positive denormal value. - // S1.u[8] -- value is a positive normal value. - // S1.u[9] -- value is positive infinity. - void - Inst_VOP3__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_F_F16 class methods --- - - Inst_VOP3__V_CMP_F_F16::Inst_VOP3__V_CMP_F_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_F_F16 - - Inst_VOP3__V_CMP_F_F16::~Inst_VOP3__V_CMP_F_F16() - { - } // ~Inst_VOP3__V_CMP_F_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_LT_F16 class methods --- - - Inst_VOP3__V_CMP_LT_F16::Inst_VOP3__V_CMP_LT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_LT_F16 - - Inst_VOP3__V_CMP_LT_F16::~Inst_VOP3__V_CMP_LT_F16() - { - } // ~Inst_VOP3__V_CMP_LT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_F16 class methods --- - - Inst_VOP3__V_CMP_EQ_F16::Inst_VOP3__V_CMP_EQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_EQ_F16 - - Inst_VOP3__V_CMP_EQ_F16::~Inst_VOP3__V_CMP_EQ_F16() - { - } // ~Inst_VOP3__V_CMP_EQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_LE_F16 class methods --- - - Inst_VOP3__V_CMP_LE_F16::Inst_VOP3__V_CMP_LE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_LE_F16 - - Inst_VOP3__V_CMP_LE_F16::~Inst_VOP3__V_CMP_LE_F16() - { - } // ~Inst_VOP3__V_CMP_LE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_GT_F16 class methods --- - - Inst_VOP3__V_CMP_GT_F16::Inst_VOP3__V_CMP_GT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_GT_F16 - - Inst_VOP3__V_CMP_GT_F16::~Inst_VOP3__V_CMP_GT_F16() - { - } // ~Inst_VOP3__V_CMP_GT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_LG_F16 class methods --- - - Inst_VOP3__V_CMP_LG_F16::Inst_VOP3__V_CMP_LG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lg_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_LG_F16 - - Inst_VOP3__V_CMP_LG_F16::~Inst_VOP3__V_CMP_LG_F16() - { - } // ~Inst_VOP3__V_CMP_LG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_GE_F16 class methods --- - - Inst_VOP3__V_CMP_GE_F16::Inst_VOP3__V_CMP_GE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_GE_F16 - - Inst_VOP3__V_CMP_GE_F16::~Inst_VOP3__V_CMP_GE_F16() - { - } // ~Inst_VOP3__V_CMP_GE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_O_F16 class methods --- - - Inst_VOP3__V_CMP_O_F16::Inst_VOP3__V_CMP_O_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_o_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_O_F16 - - Inst_VOP3__V_CMP_O_F16::~Inst_VOP3__V_CMP_O_F16() - { - } // ~Inst_VOP3__V_CMP_O_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_U_F16 class methods --- - - Inst_VOP3__V_CMP_U_F16::Inst_VOP3__V_CMP_U_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_u_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_U_F16 - - Inst_VOP3__V_CMP_U_F16::~Inst_VOP3__V_CMP_U_F16() - { - } // ~Inst_VOP3__V_CMP_U_F16 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NGE_F16 class methods --- - - Inst_VOP3__V_CMP_NGE_F16::Inst_VOP3__V_CMP_NGE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nge_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NGE_F16 - - Inst_VOP3__V_CMP_NGE_F16::~Inst_VOP3__V_CMP_NGE_F16() - { - } // ~Inst_VOP3__V_CMP_NGE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NLG_F16 class methods --- - - Inst_VOP3__V_CMP_NLG_F16::Inst_VOP3__V_CMP_NLG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlg_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLG_F16 - - Inst_VOP3__V_CMP_NLG_F16::~Inst_VOP3__V_CMP_NLG_F16() - { - } // ~Inst_VOP3__V_CMP_NLG_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NGT_F16 class methods --- - - Inst_VOP3__V_CMP_NGT_F16::Inst_VOP3__V_CMP_NGT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ngt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NGT_F16 - - Inst_VOP3__V_CMP_NGT_F16::~Inst_VOP3__V_CMP_NGT_F16() - { - } // ~Inst_VOP3__V_CMP_NGT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NLE_F16 class methods --- - - Inst_VOP3__V_CMP_NLE_F16::Inst_VOP3__V_CMP_NLE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nle_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLE_F16 - - Inst_VOP3__V_CMP_NLE_F16::~Inst_VOP3__V_CMP_NLE_F16() - { - } // ~Inst_VOP3__V_CMP_NLE_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NEQ_F16 class methods --- - - Inst_VOP3__V_CMP_NEQ_F16::Inst_VOP3__V_CMP_NEQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_neq_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NEQ_F16 - - Inst_VOP3__V_CMP_NEQ_F16::~Inst_VOP3__V_CMP_NEQ_F16() - { - } // ~Inst_VOP3__V_CMP_NEQ_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_NLT_F16 class methods --- - - Inst_VOP3__V_CMP_NLT_F16::Inst_VOP3__V_CMP_NLT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlt_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_NLT_F16 - - Inst_VOP3__V_CMP_NLT_F16::~Inst_VOP3__V_CMP_NLT_F16() - { - } // ~Inst_VOP3__V_CMP_NLT_F16 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMP_TRU_F16 class methods --- - - Inst_VOP3__V_CMP_TRU_F16::Inst_VOP3__V_CMP_TRU_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_tru_f16", true) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CMP_TRU_F16 - - Inst_VOP3__V_CMP_TRU_F16::~Inst_VOP3__V_CMP_TRU_F16() - { - } // ~Inst_VOP3__V_CMP_TRU_F16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_F16 class methods --- - - Inst_VOP3__V_CMPX_F_F16::Inst_VOP3__V_CMPX_F_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_f16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_F16 - - Inst_VOP3__V_CMPX_F_F16::~Inst_VOP3__V_CMPX_F_F16() - { - } // ~Inst_VOP3__V_CMPX_F_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_F16 class methods --- - - Inst_VOP3__V_CMPX_LT_F16::Inst_VOP3__V_CMPX_LT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_F16 - - Inst_VOP3__V_CMPX_LT_F16::~Inst_VOP3__V_CMPX_LT_F16() - { - } // ~Inst_VOP3__V_CMPX_LT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_F16 class methods --- - - Inst_VOP3__V_CMPX_EQ_F16::Inst_VOP3__V_CMPX_EQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_F16 - - Inst_VOP3__V_CMPX_EQ_F16::~Inst_VOP3__V_CMPX_EQ_F16() - { - } // ~Inst_VOP3__V_CMPX_EQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_F16 class methods --- - - Inst_VOP3__V_CMPX_LE_F16::Inst_VOP3__V_CMPX_LE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_F16 - - Inst_VOP3__V_CMPX_LE_F16::~Inst_VOP3__V_CMPX_LE_F16() - { - } // ~Inst_VOP3__V_CMPX_LE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_F16 class methods --- - - Inst_VOP3__V_CMPX_GT_F16::Inst_VOP3__V_CMPX_GT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_F16 - - Inst_VOP3__V_CMPX_GT_F16::~Inst_VOP3__V_CMPX_GT_F16() - { - } // ~Inst_VOP3__V_CMPX_GT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_LG_F16 class methods --- - - Inst_VOP3__V_CMPX_LG_F16::Inst_VOP3__V_CMPX_LG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lg_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LG_F16 - - Inst_VOP3__V_CMPX_LG_F16::~Inst_VOP3__V_CMPX_LG_F16() - { - } // ~Inst_VOP3__V_CMPX_LG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_F16 class methods --- - - Inst_VOP3__V_CMPX_GE_F16::Inst_VOP3__V_CMPX_GE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_F16 - - Inst_VOP3__V_CMPX_GE_F16::~Inst_VOP3__V_CMPX_GE_F16() - { - } // ~Inst_VOP3__V_CMPX_GE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_O_F16 class methods --- - - Inst_VOP3__V_CMPX_O_F16::Inst_VOP3__V_CMPX_O_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_o_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_O_F16 - - Inst_VOP3__V_CMPX_O_F16::~Inst_VOP3__V_CMPX_O_F16() - { - } // ~Inst_VOP3__V_CMPX_O_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_U_F16 class methods --- - - Inst_VOP3__V_CMPX_U_F16::Inst_VOP3__V_CMPX_U_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_u_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_U_F16 - - Inst_VOP3__V_CMPX_U_F16::~Inst_VOP3__V_CMPX_U_F16() - { - } // ~Inst_VOP3__V_CMPX_U_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NGE_F16 class methods --- - - Inst_VOP3__V_CMPX_NGE_F16::Inst_VOP3__V_CMPX_NGE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nge_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGE_F16 - - Inst_VOP3__V_CMPX_NGE_F16::~Inst_VOP3__V_CMPX_NGE_F16() - { - } // ~Inst_VOP3__V_CMPX_NGE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NLG_F16 class methods --- - - Inst_VOP3__V_CMPX_NLG_F16::Inst_VOP3__V_CMPX_NLG_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlg_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLG_F16 - - Inst_VOP3__V_CMPX_NLG_F16::~Inst_VOP3__V_CMPX_NLG_F16() - { - } // ~Inst_VOP3__V_CMPX_NLG_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NGT_F16 class methods --- - - Inst_VOP3__V_CMPX_NGT_F16::Inst_VOP3__V_CMPX_NGT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ngt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGT_F16 - - Inst_VOP3__V_CMPX_NGT_F16::~Inst_VOP3__V_CMPX_NGT_F16() - { - } // ~Inst_VOP3__V_CMPX_NGT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NLE_F16 class methods --- - - Inst_VOP3__V_CMPX_NLE_F16::Inst_VOP3__V_CMPX_NLE_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nle_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLE_F16 - - Inst_VOP3__V_CMPX_NLE_F16::~Inst_VOP3__V_CMPX_NLE_F16() - { - } // ~Inst_VOP3__V_CMPX_NLE_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NEQ_F16 class methods --- - - Inst_VOP3__V_CMPX_NEQ_F16::Inst_VOP3__V_CMPX_NEQ_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_neq_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NEQ_F16 - - Inst_VOP3__V_CMPX_NEQ_F16::~Inst_VOP3__V_CMPX_NEQ_F16() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_NLT_F16 class methods --- - - Inst_VOP3__V_CMPX_NLT_F16::Inst_VOP3__V_CMPX_NLT_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlt_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLT_F16 - - Inst_VOP3__V_CMPX_NLT_F16::~Inst_VOP3__V_CMPX_NLT_F16() - { - } // ~Inst_VOP3__V_CMPX_NLT_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CMPX_TRU_F16 class methods --- - - Inst_VOP3__V_CMPX_TRU_F16::Inst_VOP3__V_CMPX_TRU_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_tru_f16", true) - { - setFlag(ALU); - setFlag(F16); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_TRU_F16 - - Inst_VOP3__V_CMPX_TRU_F16::~Inst_VOP3__V_CMPX_TRU_F16() - { - } // ~Inst_VOP3__V_CMPX_TRU_F16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_F32 class methods --- - - Inst_VOP3__V_CMP_F_F32::Inst_VOP3__V_CMP_F_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_F_F32 - - Inst_VOP3__V_CMP_F_F32::~Inst_VOP3__V_CMP_F_F32() - { - } // ~Inst_VOP3__V_CMP_F_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_F32 class methods --- - - Inst_VOP3__V_CMP_LT_F32::Inst_VOP3__V_CMP_LT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LT_F32 - - Inst_VOP3__V_CMP_LT_F32::~Inst_VOP3__V_CMP_LT_F32() - { - } // ~Inst_VOP3__V_CMP_LT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_F32 class methods --- - - Inst_VOP3__V_CMP_EQ_F32::Inst_VOP3__V_CMP_EQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_EQ_F32 - - Inst_VOP3__V_CMP_EQ_F32::~Inst_VOP3__V_CMP_EQ_F32() - { - } // ~Inst_VOP3__V_CMP_EQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_F32 class methods --- - - Inst_VOP3__V_CMP_LE_F32::Inst_VOP3__V_CMP_LE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LE_F32 - - Inst_VOP3__V_CMP_LE_F32::~Inst_VOP3__V_CMP_LE_F32() - { - } // ~Inst_VOP3__V_CMP_LE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_F32 class methods --- - - Inst_VOP3__V_CMP_GT_F32::Inst_VOP3__V_CMP_GT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_GT_F32 - - Inst_VOP3__V_CMP_GT_F32::~Inst_VOP3__V_CMP_GT_F32() - { - } // ~Inst_VOP3__V_CMP_GT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LG_F32 class methods --- - - Inst_VOP3__V_CMP_LG_F32::Inst_VOP3__V_CMP_LG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_LG_F32 - - Inst_VOP3__V_CMP_LG_F32::~Inst_VOP3__V_CMP_LG_F32() - { - } // ~Inst_VOP3__V_CMP_LG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_F32 class methods --- - - Inst_VOP3__V_CMP_GE_F32::Inst_VOP3__V_CMP_GE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_GE_F32 - - Inst_VOP3__V_CMP_GE_F32::~Inst_VOP3__V_CMP_GE_F32() - { - } // ~Inst_VOP3__V_CMP_GE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_O_F32 class methods --- - - Inst_VOP3__V_CMP_O_F32::Inst_VOP3__V_CMP_O_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_o_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_O_F32 - - Inst_VOP3__V_CMP_O_F32::~Inst_VOP3__V_CMP_O_F32() - { - } // ~Inst_VOP3__V_CMP_O_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_U_F32 class methods --- - - Inst_VOP3__V_CMP_U_F32::Inst_VOP3__V_CMP_U_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_u_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_U_F32 - - Inst_VOP3__V_CMP_U_F32::~Inst_VOP3__V_CMP_U_F32() - { - } // ~Inst_VOP3__V_CMP_U_F32 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGE_F32 class methods --- - - Inst_VOP3__V_CMP_NGE_F32::Inst_VOP3__V_CMP_NGE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nge_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NGE_F32 - - Inst_VOP3__V_CMP_NGE_F32::~Inst_VOP3__V_CMP_NGE_F32() - { - } // ~Inst_VOP3__V_CMP_NGE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLG_F32 class methods --- - - Inst_VOP3__V_CMP_NLG_F32::Inst_VOP3__V_CMP_NLG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlg_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLG_F32 - - Inst_VOP3__V_CMP_NLG_F32::~Inst_VOP3__V_CMP_NLG_F32() - { - } // ~Inst_VOP3__V_CMP_NLG_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGT_F32 class methods --- - - Inst_VOP3__V_CMP_NGT_F32::Inst_VOP3__V_CMP_NGT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ngt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NGT_F32 - - Inst_VOP3__V_CMP_NGT_F32::~Inst_VOP3__V_CMP_NGT_F32() - { - } // ~Inst_VOP3__V_CMP_NGT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLE_F32 class methods --- - - Inst_VOP3__V_CMP_NLE_F32::Inst_VOP3__V_CMP_NLE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nle_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLE_F32 - - Inst_VOP3__V_CMP_NLE_F32::~Inst_VOP3__V_CMP_NLE_F32() - { - } // ~Inst_VOP3__V_CMP_NLE_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NEQ_F32 class methods --- - - Inst_VOP3__V_CMP_NEQ_F32::Inst_VOP3__V_CMP_NEQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_neq_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NEQ_F32 - - Inst_VOP3__V_CMP_NEQ_F32::~Inst_VOP3__V_CMP_NEQ_F32() - { - } // ~Inst_VOP3__V_CMP_NEQ_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLT_F32 class methods --- - - Inst_VOP3__V_CMP_NLT_F32::Inst_VOP3__V_CMP_NLT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlt_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_NLT_F32 - - Inst_VOP3__V_CMP_NLT_F32::~Inst_VOP3__V_CMP_NLT_F32() - { - } // ~Inst_VOP3__V_CMP_NLT_F32 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_TRU_F32 class methods --- - - Inst_VOP3__V_CMP_TRU_F32::Inst_VOP3__V_CMP_TRU_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_tru_f32", true) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CMP_TRU_F32 - - Inst_VOP3__V_CMP_TRU_F32::~Inst_VOP3__V_CMP_TRU_F32() - { - } // ~Inst_VOP3__V_CMP_TRU_F32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_F32 class methods --- - - Inst_VOP3__V_CMPX_F_F32::Inst_VOP3__V_CMPX_F_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_F32 - - Inst_VOP3__V_CMPX_F_F32::~Inst_VOP3__V_CMPX_F_F32() - { - } // ~Inst_VOP3__V_CMPX_F_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_F32 class methods --- - - Inst_VOP3__V_CMPX_LT_F32::Inst_VOP3__V_CMPX_LT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_F32 - - Inst_VOP3__V_CMPX_LT_F32::~Inst_VOP3__V_CMPX_LT_F32() - { - } // ~Inst_VOP3__V_CMPX_LT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_F32 class methods --- - - Inst_VOP3__V_CMPX_EQ_F32::Inst_VOP3__V_CMPX_EQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_F32 - - Inst_VOP3__V_CMPX_EQ_F32::~Inst_VOP3__V_CMPX_EQ_F32() - { - } // ~Inst_VOP3__V_CMPX_EQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_F32 class methods --- - - Inst_VOP3__V_CMPX_LE_F32::Inst_VOP3__V_CMPX_LE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_F32 - - Inst_VOP3__V_CMPX_LE_F32::~Inst_VOP3__V_CMPX_LE_F32() - { - } // ~Inst_VOP3__V_CMPX_LE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_F32 class methods --- - - Inst_VOP3__V_CMPX_GT_F32::Inst_VOP3__V_CMPX_GT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_F32 - - Inst_VOP3__V_CMPX_GT_F32::~Inst_VOP3__V_CMPX_GT_F32() - { - } // ~Inst_VOP3__V_CMPX_GT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LG_F32 class methods --- - - Inst_VOP3__V_CMPX_LG_F32::Inst_VOP3__V_CMPX_LG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lg_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LG_F32 - - Inst_VOP3__V_CMPX_LG_F32::~Inst_VOP3__V_CMPX_LG_F32() - { - } // ~Inst_VOP3__V_CMPX_LG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_F32 class methods --- - - Inst_VOP3__V_CMPX_GE_F32::Inst_VOP3__V_CMPX_GE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_F32 - - Inst_VOP3__V_CMPX_GE_F32::~Inst_VOP3__V_CMPX_GE_F32() - { - } // ~Inst_VOP3__V_CMPX_GE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_O_F32 class methods --- - - Inst_VOP3__V_CMPX_O_F32::Inst_VOP3__V_CMPX_O_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_o_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_O_F32 - - Inst_VOP3__V_CMPX_O_F32::~Inst_VOP3__V_CMPX_O_F32() - { - } // ~Inst_VOP3__V_CMPX_O_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_U_F32 class methods --- - - Inst_VOP3__V_CMPX_U_F32::Inst_VOP3__V_CMPX_U_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_u_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_U_F32 - - Inst_VOP3__V_CMPX_U_F32::~Inst_VOP3__V_CMPX_U_F32() - { - } // ~Inst_VOP3__V_CMPX_U_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. - void - Inst_VOP3__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGE_F32 class methods --- - - Inst_VOP3__V_CMPX_NGE_F32::Inst_VOP3__V_CMPX_NGE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nge_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGE_F32 - - Inst_VOP3__V_CMPX_NGE_F32::~Inst_VOP3__V_CMPX_NGE_F32() - { - } // ~Inst_VOP3__V_CMPX_NGE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLG_F32 class methods --- - - Inst_VOP3__V_CMPX_NLG_F32::Inst_VOP3__V_CMPX_NLG_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlg_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLG_F32 - - Inst_VOP3__V_CMPX_NLG_F32::~Inst_VOP3__V_CMPX_NLG_F32() - { - } // ~Inst_VOP3__V_CMPX_NLG_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGT_F32 class methods --- - - Inst_VOP3__V_CMPX_NGT_F32::Inst_VOP3__V_CMPX_NGT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ngt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGT_F32 - - Inst_VOP3__V_CMPX_NGT_F32::~Inst_VOP3__V_CMPX_NGT_F32() - { - } // ~Inst_VOP3__V_CMPX_NGT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLE_F32 class methods --- - - Inst_VOP3__V_CMPX_NLE_F32::Inst_VOP3__V_CMPX_NLE_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nle_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLE_F32 - - Inst_VOP3__V_CMPX_NLE_F32::~Inst_VOP3__V_CMPX_NLE_F32() - { - } // ~Inst_VOP3__V_CMPX_NLE_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NEQ_F32 class methods --- - - Inst_VOP3__V_CMPX_NEQ_F32::Inst_VOP3__V_CMPX_NEQ_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_neq_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NEQ_F32 - - Inst_VOP3__V_CMPX_NEQ_F32::~Inst_VOP3__V_CMPX_NEQ_F32() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLT_F32 class methods --- - - Inst_VOP3__V_CMPX_NLT_F32::Inst_VOP3__V_CMPX_NLT_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlt_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLT_F32 - - Inst_VOP3__V_CMPX_NLT_F32::~Inst_VOP3__V_CMPX_NLT_F32() - { - } // ~Inst_VOP3__V_CMPX_NLT_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_TRU_F32 class methods --- - - Inst_VOP3__V_CMPX_TRU_F32::Inst_VOP3__V_CMPX_TRU_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_tru_f32", true) - { - setFlag(ALU); - setFlag(F32); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_TRU_F32 - - Inst_VOP3__V_CMPX_TRU_F32::~Inst_VOP3__V_CMPX_TRU_F32() - { - } // ~Inst_VOP3__V_CMPX_TRU_F32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_F64 class methods --- - - Inst_VOP3__V_CMP_F_F64::Inst_VOP3__V_CMP_F_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_F_F64 - - Inst_VOP3__V_CMP_F_F64::~Inst_VOP3__V_CMP_F_F64() - { - } // ~Inst_VOP3__V_CMP_F_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_F64 class methods --- - - Inst_VOP3__V_CMP_LT_F64::Inst_VOP3__V_CMP_LT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LT_F64 - - Inst_VOP3__V_CMP_LT_F64::~Inst_VOP3__V_CMP_LT_F64() - { - } // ~Inst_VOP3__V_CMP_LT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_F64 class methods --- - - Inst_VOP3__V_CMP_EQ_F64::Inst_VOP3__V_CMP_EQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_EQ_F64 - - Inst_VOP3__V_CMP_EQ_F64::~Inst_VOP3__V_CMP_EQ_F64() - { - } // ~Inst_VOP3__V_CMP_EQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_F64 class methods --- - - Inst_VOP3__V_CMP_LE_F64::Inst_VOP3__V_CMP_LE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LE_F64 - - Inst_VOP3__V_CMP_LE_F64::~Inst_VOP3__V_CMP_LE_F64() - { - } // ~Inst_VOP3__V_CMP_LE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_F64 class methods --- - - Inst_VOP3__V_CMP_GT_F64::Inst_VOP3__V_CMP_GT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_GT_F64 - - Inst_VOP3__V_CMP_GT_F64::~Inst_VOP3__V_CMP_GT_F64() - { - } // ~Inst_VOP3__V_CMP_GT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LG_F64 class methods --- - - Inst_VOP3__V_CMP_LG_F64::Inst_VOP3__V_CMP_LG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_LG_F64 - - Inst_VOP3__V_CMP_LG_F64::~Inst_VOP3__V_CMP_LG_F64() - { - } // ~Inst_VOP3__V_CMP_LG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_F64 class methods --- - - Inst_VOP3__V_CMP_GE_F64::Inst_VOP3__V_CMP_GE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_GE_F64 - - Inst_VOP3__V_CMP_GE_F64::~Inst_VOP3__V_CMP_GE_F64() - { - } // ~Inst_VOP3__V_CMP_GE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_O_F64 class methods --- - - Inst_VOP3__V_CMP_O_F64::Inst_VOP3__V_CMP_O_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_o_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_O_F64 - - Inst_VOP3__V_CMP_O_F64::~Inst_VOP3__V_CMP_O_F64() - { - } // ~Inst_VOP3__V_CMP_O_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_U_F64 class methods --- - - Inst_VOP3__V_CMP_U_F64::Inst_VOP3__V_CMP_U_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_u_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_U_F64 - - Inst_VOP3__V_CMP_U_F64::~Inst_VOP3__V_CMP_U_F64() - { - } // ~Inst_VOP3__V_CMP_U_F64 - - // --- description from .arch file --- - // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGE_F64 class methods --- - - Inst_VOP3__V_CMP_NGE_F64::Inst_VOP3__V_CMP_NGE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nge_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NGE_F64 - - Inst_VOP3__V_CMP_NGE_F64::~Inst_VOP3__V_CMP_NGE_F64() - { - } // ~Inst_VOP3__V_CMP_NGE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLG_F64 class methods --- - - Inst_VOP3__V_CMP_NLG_F64::Inst_VOP3__V_CMP_NLG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlg_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLG_F64 - - Inst_VOP3__V_CMP_NLG_F64::~Inst_VOP3__V_CMP_NLG_F64() - { - } // ~Inst_VOP3__V_CMP_NLG_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NGT_F64 class methods --- - - Inst_VOP3__V_CMP_NGT_F64::Inst_VOP3__V_CMP_NGT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ngt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NGT_F64 - - Inst_VOP3__V_CMP_NGT_F64::~Inst_VOP3__V_CMP_NGT_F64() - { - } // ~Inst_VOP3__V_CMP_NGT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLE_F64 class methods --- - - Inst_VOP3__V_CMP_NLE_F64::Inst_VOP3__V_CMP_NLE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nle_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLE_F64 - - Inst_VOP3__V_CMP_NLE_F64::~Inst_VOP3__V_CMP_NLE_F64() - { - } // ~Inst_VOP3__V_CMP_NLE_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NEQ_F64 class methods --- - - Inst_VOP3__V_CMP_NEQ_F64::Inst_VOP3__V_CMP_NEQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_neq_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NEQ_F64 - - Inst_VOP3__V_CMP_NEQ_F64::~Inst_VOP3__V_CMP_NEQ_F64() - { - } // ~Inst_VOP3__V_CMP_NEQ_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NLT_F64 class methods --- - - Inst_VOP3__V_CMP_NLT_F64::Inst_VOP3__V_CMP_NLT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_nlt_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_NLT_F64 - - Inst_VOP3__V_CMP_NLT_F64::~Inst_VOP3__V_CMP_NLT_F64() - { - } // ~Inst_VOP3__V_CMP_NLT_F64 - - // --- description from .arch file --- - // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_TRU_F64 class methods --- - - Inst_VOP3__V_CMP_TRU_F64::Inst_VOP3__V_CMP_TRU_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_tru_f64", true) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CMP_TRU_F64 - - Inst_VOP3__V_CMP_TRU_F64::~Inst_VOP3__V_CMP_TRU_F64() - { - } // ~Inst_VOP3__V_CMP_TRU_F64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_F64 class methods --- - - Inst_VOP3__V_CMPX_F_F64::Inst_VOP3__V_CMPX_F_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_F64 - - Inst_VOP3__V_CMPX_F_F64::~Inst_VOP3__V_CMPX_F_F64() - { - } // ~Inst_VOP3__V_CMPX_F_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_F64 class methods --- - - Inst_VOP3__V_CMPX_LT_F64::Inst_VOP3__V_CMPX_LT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_F64 - - Inst_VOP3__V_CMPX_LT_F64::~Inst_VOP3__V_CMPX_LT_F64() - { - } // ~Inst_VOP3__V_CMPX_LT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_F64 class methods --- - - Inst_VOP3__V_CMPX_EQ_F64::Inst_VOP3__V_CMPX_EQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_F64 - - Inst_VOP3__V_CMPX_EQ_F64::~Inst_VOP3__V_CMPX_EQ_F64() - { - } // ~Inst_VOP3__V_CMPX_EQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_F64 class methods --- - - Inst_VOP3__V_CMPX_LE_F64::Inst_VOP3__V_CMPX_LE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_F64 - - Inst_VOP3__V_CMPX_LE_F64::~Inst_VOP3__V_CMPX_LE_F64() - { - } // ~Inst_VOP3__V_CMPX_LE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_F64 class methods --- - - Inst_VOP3__V_CMPX_GT_F64::Inst_VOP3__V_CMPX_GT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_F64 - - Inst_VOP3__V_CMPX_GT_F64::~Inst_VOP3__V_CMPX_GT_F64() - { - } // ~Inst_VOP3__V_CMPX_GT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LG_F64 class methods --- - - Inst_VOP3__V_CMPX_LG_F64::Inst_VOP3__V_CMPX_LG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lg_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LG_F64 - - Inst_VOP3__V_CMPX_LG_F64::~Inst_VOP3__V_CMPX_LG_F64() - { - } // ~Inst_VOP3__V_CMPX_LG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_F64 class methods --- - - Inst_VOP3__V_CMPX_GE_F64::Inst_VOP3__V_CMPX_GE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_F64 - - Inst_VOP3__V_CMPX_GE_F64::~Inst_VOP3__V_CMPX_GE_F64() - { - } // ~Inst_VOP3__V_CMPX_GE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_O_F64 class methods --- - - Inst_VOP3__V_CMPX_O_F64::Inst_VOP3__V_CMPX_O_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_o_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_O_F64 - - Inst_VOP3__V_CMPX_O_F64::~Inst_VOP3__V_CMPX_O_F64() - { - } // ~Inst_VOP3__V_CMPX_O_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (!std::isnan(src0[lane]) - && !std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_U_F64 class methods --- - - Inst_VOP3__V_CMPX_U_F64::Inst_VOP3__V_CMPX_U_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_u_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_U_F64 - - Inst_VOP3__V_CMPX_U_F64::~Inst_VOP3__V_CMPX_U_F64() - { - } // ~Inst_VOP3__V_CMPX_U_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC - // encoding. 
- void - Inst_VOP3__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, (std::isnan(src0[lane]) - || std::isnan(src1[lane])) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGE_F64 class methods --- - - Inst_VOP3__V_CMPX_NGE_F64::Inst_VOP3__V_CMPX_NGE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nge_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGE_F64 - - Inst_VOP3__V_CMPX_NGE_F64::~Inst_VOP3__V_CMPX_NGE_F64() - { - } // ~Inst_VOP3__V_CMPX_NGE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLG_F64 class methods --- - - Inst_VOP3__V_CMPX_NLG_F64::Inst_VOP3__V_CMPX_NLG_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlg_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLG_F64 - - Inst_VOP3__V_CMPX_NLG_F64::~Inst_VOP3__V_CMPX_NLG_F64() - { - } // ~Inst_VOP3__V_CMPX_NLG_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane] - || src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NGT_F64 class methods --- - - Inst_VOP3__V_CMPX_NGT_F64::Inst_VOP3__V_CMPX_NGT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ngt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NGT_F64 - - Inst_VOP3__V_CMPX_NGT_F64::~Inst_VOP3__V_CMPX_NGT_F64() - { - } // ~Inst_VOP3__V_CMPX_NGT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLE_F64 class methods --- - - Inst_VOP3__V_CMPX_NLE_F64::Inst_VOP3__V_CMPX_NLE_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nle_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLE_F64 - - Inst_VOP3__V_CMPX_NLE_F64::~Inst_VOP3__V_CMPX_NLE_F64() - { - } // ~Inst_VOP3__V_CMPX_NLE_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NEQ_F64 class methods --- - - Inst_VOP3__V_CMPX_NEQ_F64::Inst_VOP3__V_CMPX_NEQ_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_neq_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NEQ_F64 - - Inst_VOP3__V_CMPX_NEQ_F64::~Inst_VOP3__V_CMPX_NEQ_F64() - { - } // ~Inst_VOP3__V_CMPX_NEQ_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NLT_F64 class methods --- - - Inst_VOP3__V_CMPX_NLT_F64::Inst_VOP3__V_CMPX_NLT_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_nlt_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NLT_F64 - - Inst_VOP3__V_CMPX_NLT_F64::~Inst_VOP3__V_CMPX_NLT_F64() - { - } // ~Inst_VOP3__V_CMPX_NLT_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_TRU_F64 class methods --- - - Inst_VOP3__V_CMPX_TRU_F64::Inst_VOP3__V_CMPX_TRU_F64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_tru_f64", true) - { - setFlag(ALU); - setFlag(F64); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_TRU_F64 - - Inst_VOP3__V_CMPX_TRU_F64::~Inst_VOP3__V_CMPX_TRU_F64() - { - } // ~Inst_VOP3__V_CMPX_TRU_F64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_I16 class methods --- - - Inst_VOP3__V_CMP_F_I16::Inst_VOP3__V_CMP_F_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I16 - - Inst_VOP3__V_CMP_F_I16::~Inst_VOP3__V_CMP_F_I16() - { - } // ~Inst_VOP3__V_CMP_F_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_I16 class methods --- - - Inst_VOP3__V_CMP_LT_I16::Inst_VOP3__V_CMP_LT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I16 - - Inst_VOP3__V_CMP_LT_I16::~Inst_VOP3__V_CMP_LT_I16() - { - } // ~Inst_VOP3__V_CMP_LT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_I16 class methods --- - - Inst_VOP3__V_CMP_EQ_I16::Inst_VOP3__V_CMP_EQ_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I16 - - Inst_VOP3__V_CMP_EQ_I16::~Inst_VOP3__V_CMP_EQ_I16() - { - } // ~Inst_VOP3__V_CMP_EQ_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_I16 class methods --- - - Inst_VOP3__V_CMP_LE_I16::Inst_VOP3__V_CMP_LE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I16 - - Inst_VOP3__V_CMP_LE_I16::~Inst_VOP3__V_CMP_LE_I16() - { - } // ~Inst_VOP3__V_CMP_LE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_I16 class methods --- - - Inst_VOP3__V_CMP_GT_I16::Inst_VOP3__V_CMP_GT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I16 - - Inst_VOP3__V_CMP_GT_I16::~Inst_VOP3__V_CMP_GT_I16() - { - } // ~Inst_VOP3__V_CMP_GT_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_I16 class methods --- - - Inst_VOP3__V_CMP_NE_I16::Inst_VOP3__V_CMP_NE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I16 - - Inst_VOP3__V_CMP_NE_I16::~Inst_VOP3__V_CMP_NE_I16() - { - } // ~Inst_VOP3__V_CMP_NE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_I16 class methods --- - - Inst_VOP3__V_CMP_GE_I16::Inst_VOP3__V_CMP_GE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I16 - - Inst_VOP3__V_CMP_GE_I16::~Inst_VOP3__V_CMP_GE_I16() - { - } // ~Inst_VOP3__V_CMP_GE_I16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_I16 class methods --- - - Inst_VOP3__V_CMP_T_I16::Inst_VOP3__V_CMP_T_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_i16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I16 - - Inst_VOP3__V_CMP_T_I16::~Inst_VOP3__V_CMP_T_I16() - { - } // ~Inst_VOP3__V_CMP_T_I16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_U16 class methods --- - - Inst_VOP3__V_CMP_F_U16::Inst_VOP3__V_CMP_F_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U16 - - Inst_VOP3__V_CMP_F_U16::~Inst_VOP3__V_CMP_F_U16() - { - } // ~Inst_VOP3__V_CMP_F_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_U16 class methods --- - - Inst_VOP3__V_CMP_LT_U16::Inst_VOP3__V_CMP_LT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U16 - - Inst_VOP3__V_CMP_LT_U16::~Inst_VOP3__V_CMP_LT_U16() - { - } // ~Inst_VOP3__V_CMP_LT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_U16 class methods --- - - Inst_VOP3__V_CMP_EQ_U16::Inst_VOP3__V_CMP_EQ_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U16 - - Inst_VOP3__V_CMP_EQ_U16::~Inst_VOP3__V_CMP_EQ_U16() - { - } // ~Inst_VOP3__V_CMP_EQ_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_U16 class methods --- - - Inst_VOP3__V_CMP_LE_U16::Inst_VOP3__V_CMP_LE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U16 - - Inst_VOP3__V_CMP_LE_U16::~Inst_VOP3__V_CMP_LE_U16() - { - } // ~Inst_VOP3__V_CMP_LE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_U16 class methods --- - - Inst_VOP3__V_CMP_GT_U16::Inst_VOP3__V_CMP_GT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U16 - - Inst_VOP3__V_CMP_GT_U16::~Inst_VOP3__V_CMP_GT_U16() - { - } // ~Inst_VOP3__V_CMP_GT_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_U16 class methods --- - - Inst_VOP3__V_CMP_NE_U16::Inst_VOP3__V_CMP_NE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U16 - - Inst_VOP3__V_CMP_NE_U16::~Inst_VOP3__V_CMP_NE_U16() - { - } // ~Inst_VOP3__V_CMP_NE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_U16 class methods --- - - Inst_VOP3__V_CMP_GE_U16::Inst_VOP3__V_CMP_GE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U16 - - Inst_VOP3__V_CMP_GE_U16::~Inst_VOP3__V_CMP_GE_U16() - { - } // ~Inst_VOP3__V_CMP_GE_U16 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_U16 class methods --- - - Inst_VOP3__V_CMP_T_U16::Inst_VOP3__V_CMP_T_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_u16", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U16 - - Inst_VOP3__V_CMP_T_U16::~Inst_VOP3__V_CMP_T_U16() - { - } // ~Inst_VOP3__V_CMP_T_U16 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_I16 class methods --- - - Inst_VOP3__V_CMPX_F_I16::Inst_VOP3__V_CMPX_F_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_I16 - - Inst_VOP3__V_CMPX_F_I16::~Inst_VOP3__V_CMPX_F_I16() - { - } // ~Inst_VOP3__V_CMPX_F_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_I16 class methods --- - - Inst_VOP3__V_CMPX_LT_I16::Inst_VOP3__V_CMPX_LT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_I16 - - Inst_VOP3__V_CMPX_LT_I16::~Inst_VOP3__V_CMPX_LT_I16() - { - } // ~Inst_VOP3__V_CMPX_LT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_I16 class methods --- - - Inst_VOP3__V_CMPX_EQ_I16::Inst_VOP3__V_CMPX_EQ_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_I16 - - Inst_VOP3__V_CMPX_EQ_I16::~Inst_VOP3__V_CMPX_EQ_I16() - { - } // ~Inst_VOP3__V_CMPX_EQ_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_I16 class methods --- - - Inst_VOP3__V_CMPX_LE_I16::Inst_VOP3__V_CMPX_LE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_I16 - - Inst_VOP3__V_CMPX_LE_I16::~Inst_VOP3__V_CMPX_LE_I16() - { - } // ~Inst_VOP3__V_CMPX_LE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_I16 class methods --- - - Inst_VOP3__V_CMPX_GT_I16::Inst_VOP3__V_CMPX_GT_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_I16 - - Inst_VOP3__V_CMPX_GT_I16::~Inst_VOP3__V_CMPX_GT_I16() - { - } // ~Inst_VOP3__V_CMPX_GT_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_I16 class methods --- - - Inst_VOP3__V_CMPX_NE_I16::Inst_VOP3__V_CMPX_NE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_I16 - - Inst_VOP3__V_CMPX_NE_I16::~Inst_VOP3__V_CMPX_NE_I16() - { - } // ~Inst_VOP3__V_CMPX_NE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_I16 class methods --- - - Inst_VOP3__V_CMPX_GE_I16::Inst_VOP3__V_CMPX_GE_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_I16 - - Inst_VOP3__V_CMPX_GE_I16::~Inst_VOP3__V_CMPX_GE_I16() - { - } // ~Inst_VOP3__V_CMPX_GE_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_I16 class methods --- - - Inst_VOP3__V_CMPX_T_I16::Inst_VOP3__V_CMPX_T_I16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_i16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_I16 - - Inst_VOP3__V_CMPX_T_I16::~Inst_VOP3__V_CMPX_T_I16() - { - } // ~Inst_VOP3__V_CMPX_T_I16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_U16 class methods --- - - Inst_VOP3__V_CMPX_F_U16::Inst_VOP3__V_CMPX_F_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_U16 - - Inst_VOP3__V_CMPX_F_U16::~Inst_VOP3__V_CMPX_F_U16() - { - } // ~Inst_VOP3__V_CMPX_F_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_U16 class methods --- - - Inst_VOP3__V_CMPX_LT_U16::Inst_VOP3__V_CMPX_LT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_U16 - - Inst_VOP3__V_CMPX_LT_U16::~Inst_VOP3__V_CMPX_LT_U16() - { - } // ~Inst_VOP3__V_CMPX_LT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_U16 class methods --- - - Inst_VOP3__V_CMPX_EQ_U16::Inst_VOP3__V_CMPX_EQ_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_U16 - - Inst_VOP3__V_CMPX_EQ_U16::~Inst_VOP3__V_CMPX_EQ_U16() - { - } // ~Inst_VOP3__V_CMPX_EQ_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_U16 class methods --- - - Inst_VOP3__V_CMPX_LE_U16::Inst_VOP3__V_CMPX_LE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_U16 - - Inst_VOP3__V_CMPX_LE_U16::~Inst_VOP3__V_CMPX_LE_U16() - { - } // ~Inst_VOP3__V_CMPX_LE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_U16 class methods --- - - Inst_VOP3__V_CMPX_GT_U16::Inst_VOP3__V_CMPX_GT_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_U16 - - Inst_VOP3__V_CMPX_GT_U16::~Inst_VOP3__V_CMPX_GT_U16() - { - } // ~Inst_VOP3__V_CMPX_GT_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_U16 class methods --- - - Inst_VOP3__V_CMPX_NE_U16::Inst_VOP3__V_CMPX_NE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_U16 - - Inst_VOP3__V_CMPX_NE_U16::~Inst_VOP3__V_CMPX_NE_U16() - { - } // ~Inst_VOP3__V_CMPX_NE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_U16 class methods --- - - Inst_VOP3__V_CMPX_GE_U16::Inst_VOP3__V_CMPX_GE_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_U16 - - Inst_VOP3__V_CMPX_GE_U16::~Inst_VOP3__V_CMPX_GE_U16() - { - } // ~Inst_VOP3__V_CMPX_GE_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_U16 class methods --- - - Inst_VOP3__V_CMPX_T_U16::Inst_VOP3__V_CMPX_T_U16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_u16", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_U16 - - Inst_VOP3__V_CMPX_T_U16::~Inst_VOP3__V_CMPX_T_U16() - { - } // ~Inst_VOP3__V_CMPX_T_U16 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_I32 class methods --- - - Inst_VOP3__V_CMP_F_I32::Inst_VOP3__V_CMP_F_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I32 - - Inst_VOP3__V_CMP_F_I32::~Inst_VOP3__V_CMP_F_I32() - { - } // ~Inst_VOP3__V_CMP_F_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_I32 class methods --- - - Inst_VOP3__V_CMP_LT_I32::Inst_VOP3__V_CMP_LT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I32 - - Inst_VOP3__V_CMP_LT_I32::~Inst_VOP3__V_CMP_LT_I32() - { - } // ~Inst_VOP3__V_CMP_LT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_I32 class methods --- - - Inst_VOP3__V_CMP_EQ_I32::Inst_VOP3__V_CMP_EQ_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I32 - - Inst_VOP3__V_CMP_EQ_I32::~Inst_VOP3__V_CMP_EQ_I32() - { - } // ~Inst_VOP3__V_CMP_EQ_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_I32 class methods --- - - Inst_VOP3__V_CMP_LE_I32::Inst_VOP3__V_CMP_LE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I32 - - Inst_VOP3__V_CMP_LE_I32::~Inst_VOP3__V_CMP_LE_I32() - { - } // ~Inst_VOP3__V_CMP_LE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_I32 class methods --- - - Inst_VOP3__V_CMP_GT_I32::Inst_VOP3__V_CMP_GT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I32 - - Inst_VOP3__V_CMP_GT_I32::~Inst_VOP3__V_CMP_GT_I32() - { - } // ~Inst_VOP3__V_CMP_GT_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_I32 class methods --- - - Inst_VOP3__V_CMP_NE_I32::Inst_VOP3__V_CMP_NE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I32 - - Inst_VOP3__V_CMP_NE_I32::~Inst_VOP3__V_CMP_NE_I32() - { - } // ~Inst_VOP3__V_CMP_NE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_I32 class methods --- - - Inst_VOP3__V_CMP_GE_I32::Inst_VOP3__V_CMP_GE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I32 - - Inst_VOP3__V_CMP_GE_I32::~Inst_VOP3__V_CMP_GE_I32() - { - } // ~Inst_VOP3__V_CMP_GE_I32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_I32 class methods --- - - Inst_VOP3__V_CMP_T_I32::Inst_VOP3__V_CMP_T_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_i32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I32 - - Inst_VOP3__V_CMP_T_I32::~Inst_VOP3__V_CMP_T_I32() - { - } // ~Inst_VOP3__V_CMP_T_I32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_U32 class methods --- - - Inst_VOP3__V_CMP_F_U32::Inst_VOP3__V_CMP_F_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U32 - - Inst_VOP3__V_CMP_F_U32::~Inst_VOP3__V_CMP_F_U32() - { - } // ~Inst_VOP3__V_CMP_F_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_U32 class methods --- - - Inst_VOP3__V_CMP_LT_U32::Inst_VOP3__V_CMP_LT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U32 - - Inst_VOP3__V_CMP_LT_U32::~Inst_VOP3__V_CMP_LT_U32() - { - } // ~Inst_VOP3__V_CMP_LT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_U32 class methods --- - - Inst_VOP3__V_CMP_EQ_U32::Inst_VOP3__V_CMP_EQ_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U32 - - Inst_VOP3__V_CMP_EQ_U32::~Inst_VOP3__V_CMP_EQ_U32() - { - } // ~Inst_VOP3__V_CMP_EQ_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_U32 class methods --- - - Inst_VOP3__V_CMP_LE_U32::Inst_VOP3__V_CMP_LE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U32 - - Inst_VOP3__V_CMP_LE_U32::~Inst_VOP3__V_CMP_LE_U32() - { - } // ~Inst_VOP3__V_CMP_LE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_U32 class methods --- - - Inst_VOP3__V_CMP_GT_U32::Inst_VOP3__V_CMP_GT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U32 - - Inst_VOP3__V_CMP_GT_U32::~Inst_VOP3__V_CMP_GT_U32() - { - } // ~Inst_VOP3__V_CMP_GT_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_U32 class methods --- - - Inst_VOP3__V_CMP_NE_U32::Inst_VOP3__V_CMP_NE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U32 - - Inst_VOP3__V_CMP_NE_U32::~Inst_VOP3__V_CMP_NE_U32() - { - } // ~Inst_VOP3__V_CMP_NE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_U32 class methods --- - - Inst_VOP3__V_CMP_GE_U32::Inst_VOP3__V_CMP_GE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U32 - - Inst_VOP3__V_CMP_GE_U32::~Inst_VOP3__V_CMP_GE_U32() - { - } // ~Inst_VOP3__V_CMP_GE_U32 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_U32 class methods --- - - Inst_VOP3__V_CMP_T_U32::Inst_VOP3__V_CMP_T_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_u32", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U32 - - Inst_VOP3__V_CMP_T_U32::~Inst_VOP3__V_CMP_T_U32() - { - } // ~Inst_VOP3__V_CMP_T_U32 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_I32 class methods --- - - Inst_VOP3__V_CMPX_F_I32::Inst_VOP3__V_CMPX_F_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_I32 - - Inst_VOP3__V_CMPX_F_I32::~Inst_VOP3__V_CMPX_F_I32() - { - } // ~Inst_VOP3__V_CMPX_F_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_I32 class methods --- - - Inst_VOP3__V_CMPX_LT_I32::Inst_VOP3__V_CMPX_LT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_I32 - - Inst_VOP3__V_CMPX_LT_I32::~Inst_VOP3__V_CMPX_LT_I32() - { - } // ~Inst_VOP3__V_CMPX_LT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_I32 class methods --- - - Inst_VOP3__V_CMPX_EQ_I32::Inst_VOP3__V_CMPX_EQ_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_I32 - - Inst_VOP3__V_CMPX_EQ_I32::~Inst_VOP3__V_CMPX_EQ_I32() - { - } // ~Inst_VOP3__V_CMPX_EQ_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_I32 class methods --- - - Inst_VOP3__V_CMPX_LE_I32::Inst_VOP3__V_CMPX_LE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_I32 - - Inst_VOP3__V_CMPX_LE_I32::~Inst_VOP3__V_CMPX_LE_I32() - { - } // ~Inst_VOP3__V_CMPX_LE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_I32 class methods --- - - Inst_VOP3__V_CMPX_GT_I32::Inst_VOP3__V_CMPX_GT_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_I32 - - Inst_VOP3__V_CMPX_GT_I32::~Inst_VOP3__V_CMPX_GT_I32() - { - } // ~Inst_VOP3__V_CMPX_GT_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_I32 class methods --- - - Inst_VOP3__V_CMPX_NE_I32::Inst_VOP3__V_CMPX_NE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_I32 - - Inst_VOP3__V_CMPX_NE_I32::~Inst_VOP3__V_CMPX_NE_I32() - { - } // ~Inst_VOP3__V_CMPX_NE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_I32 class methods --- - - Inst_VOP3__V_CMPX_GE_I32::Inst_VOP3__V_CMPX_GE_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_I32 - - Inst_VOP3__V_CMPX_GE_I32::~Inst_VOP3__V_CMPX_GE_I32() - { - } // ~Inst_VOP3__V_CMPX_GE_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_I32 class methods --- - - Inst_VOP3__V_CMPX_T_I32::Inst_VOP3__V_CMPX_T_I32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_i32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_I32 - - Inst_VOP3__V_CMPX_T_I32::~Inst_VOP3__V_CMPX_T_I32() - { - } // ~Inst_VOP3__V_CMPX_T_I32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_U32 class methods --- - - Inst_VOP3__V_CMPX_F_U32::Inst_VOP3__V_CMPX_F_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_U32 - - Inst_VOP3__V_CMPX_F_U32::~Inst_VOP3__V_CMPX_F_U32() - { - } // ~Inst_VOP3__V_CMPX_F_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_U32 class methods --- - - Inst_VOP3__V_CMPX_LT_U32::Inst_VOP3__V_CMPX_LT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_U32 - - Inst_VOP3__V_CMPX_LT_U32::~Inst_VOP3__V_CMPX_LT_U32() - { - } // ~Inst_VOP3__V_CMPX_LT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_U32 class methods --- - - Inst_VOP3__V_CMPX_EQ_U32::Inst_VOP3__V_CMPX_EQ_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_U32 - - Inst_VOP3__V_CMPX_EQ_U32::~Inst_VOP3__V_CMPX_EQ_U32() - { - } // ~Inst_VOP3__V_CMPX_EQ_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_U32 class methods --- - - Inst_VOP3__V_CMPX_LE_U32::Inst_VOP3__V_CMPX_LE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_U32 - - Inst_VOP3__V_CMPX_LE_U32::~Inst_VOP3__V_CMPX_LE_U32() - { - } // ~Inst_VOP3__V_CMPX_LE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_U32 class methods --- - - Inst_VOP3__V_CMPX_GT_U32::Inst_VOP3__V_CMPX_GT_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_U32 - - Inst_VOP3__V_CMPX_GT_U32::~Inst_VOP3__V_CMPX_GT_U32() - { - } // ~Inst_VOP3__V_CMPX_GT_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_U32 class methods --- - - Inst_VOP3__V_CMPX_NE_U32::Inst_VOP3__V_CMPX_NE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_U32 - - Inst_VOP3__V_CMPX_NE_U32::~Inst_VOP3__V_CMPX_NE_U32() - { - } // ~Inst_VOP3__V_CMPX_NE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_U32 class methods --- - - Inst_VOP3__V_CMPX_GE_U32::Inst_VOP3__V_CMPX_GE_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_U32 - - Inst_VOP3__V_CMPX_GE_U32::~Inst_VOP3__V_CMPX_GE_U32() - { - } // ~Inst_VOP3__V_CMPX_GE_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_U32 class methods --- - - Inst_VOP3__V_CMPX_T_U32::Inst_VOP3__V_CMPX_T_U32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_u32", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_U32 - - Inst_VOP3__V_CMPX_T_U32::~Inst_VOP3__V_CMPX_T_U32() - { - } // ~Inst_VOP3__V_CMPX_T_U32 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_I64 class methods --- - - Inst_VOP3__V_CMP_F_I64::Inst_VOP3__V_CMP_F_I64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_I64 - - Inst_VOP3__V_CMP_F_I64::~Inst_VOP3__V_CMP_F_I64() - { - } // ~Inst_VOP3__V_CMP_F_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_I64 class methods --- - - Inst_VOP3__V_CMP_LT_I64::Inst_VOP3__V_CMP_LT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_I64 - - Inst_VOP3__V_CMP_LT_I64::~Inst_VOP3__V_CMP_LT_I64() - { - } // ~Inst_VOP3__V_CMP_LT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_I64 class methods --- - - Inst_VOP3__V_CMP_EQ_I64::Inst_VOP3__V_CMP_EQ_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_I64 - - Inst_VOP3__V_CMP_EQ_I64::~Inst_VOP3__V_CMP_EQ_I64() - { - } // ~Inst_VOP3__V_CMP_EQ_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_I64 class methods --- - - Inst_VOP3__V_CMP_LE_I64::Inst_VOP3__V_CMP_LE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_I64 - - Inst_VOP3__V_CMP_LE_I64::~Inst_VOP3__V_CMP_LE_I64() - { - } // ~Inst_VOP3__V_CMP_LE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_I64 class methods --- - - Inst_VOP3__V_CMP_GT_I64::Inst_VOP3__V_CMP_GT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_I64 - - Inst_VOP3__V_CMP_GT_I64::~Inst_VOP3__V_CMP_GT_I64() - { - } // ~Inst_VOP3__V_CMP_GT_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_I64 class methods --- - - Inst_VOP3__V_CMP_NE_I64::Inst_VOP3__V_CMP_NE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_I64 - - Inst_VOP3__V_CMP_NE_I64::~Inst_VOP3__V_CMP_NE_I64() - { - } // ~Inst_VOP3__V_CMP_NE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_I64 class methods --- - - Inst_VOP3__V_CMP_GE_I64::Inst_VOP3__V_CMP_GE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_I64 - - Inst_VOP3__V_CMP_GE_I64::~Inst_VOP3__V_CMP_GE_I64() - { - } // ~Inst_VOP3__V_CMP_GE_I64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_I64 class methods --- - - Inst_VOP3__V_CMP_T_I64::Inst_VOP3__V_CMP_T_I64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_i64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_I64 - - Inst_VOP3__V_CMP_T_I64::~Inst_VOP3__V_CMP_T_I64() - { - } // ~Inst_VOP3__V_CMP_T_I64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_F_U64 class methods --- - - Inst_VOP3__V_CMP_F_U64::Inst_VOP3__V_CMP_F_U64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_f_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_F_U64 - - Inst_VOP3__V_CMP_F_U64::~Inst_VOP3__V_CMP_F_U64() - { - } // ~Inst_VOP3__V_CMP_F_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LT_U64 class methods --- - - Inst_VOP3__V_CMP_LT_U64::Inst_VOP3__V_CMP_LT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_lt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LT_U64 - - Inst_VOP3__V_CMP_LT_U64::~Inst_VOP3__V_CMP_LT_U64() - { - } // ~Inst_VOP3__V_CMP_LT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_EQ_U64 class methods --- - - Inst_VOP3__V_CMP_EQ_U64::Inst_VOP3__V_CMP_EQ_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_eq_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_EQ_U64 - - Inst_VOP3__V_CMP_EQ_U64::~Inst_VOP3__V_CMP_EQ_U64() - { - } // ~Inst_VOP3__V_CMP_EQ_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_LE_U64 class methods --- - - Inst_VOP3__V_CMP_LE_U64::Inst_VOP3__V_CMP_LE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_le_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_LE_U64 - - Inst_VOP3__V_CMP_LE_U64::~Inst_VOP3__V_CMP_LE_U64() - { - } // ~Inst_VOP3__V_CMP_LE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GT_U64 class methods --- - - Inst_VOP3__V_CMP_GT_U64::Inst_VOP3__V_CMP_GT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_gt_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GT_U64 - - Inst_VOP3__V_CMP_GT_U64::~Inst_VOP3__V_CMP_GT_U64() - { - } // ~Inst_VOP3__V_CMP_GT_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_NE_U64 class methods --- - - Inst_VOP3__V_CMP_NE_U64::Inst_VOP3__V_CMP_NE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ne_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_NE_U64 - - Inst_VOP3__V_CMP_NE_U64::~Inst_VOP3__V_CMP_NE_U64() - { - } // ~Inst_VOP3__V_CMP_NE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_GE_U64 class methods --- - - Inst_VOP3__V_CMP_GE_U64::Inst_VOP3__V_CMP_GE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_ge_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_GE_U64 - - Inst_VOP3__V_CMP_GE_U64::~Inst_VOP3__V_CMP_GE_U64() - { - } // ~Inst_VOP3__V_CMP_GE_U64 - - // --- description from .arch file --- - // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMP_T_U64 class methods --- - - Inst_VOP3__V_CMP_T_U64::Inst_VOP3__V_CMP_T_U64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmp_t_u64", true) - { - setFlag(ALU); - } // Inst_VOP3__V_CMP_T_U64 - - Inst_VOP3__V_CMP_T_U64::~Inst_VOP3__V_CMP_T_U64() - { - } // ~Inst_VOP3__V_CMP_T_U64 - - // --- description from .arch file --- - // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_I64 class methods --- - - Inst_VOP3__V_CMPX_F_I64::Inst_VOP3__V_CMPX_F_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_I64 - - Inst_VOP3__V_CMPX_F_I64::~Inst_VOP3__V_CMPX_F_I64() - { - } // ~Inst_VOP3__V_CMPX_F_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_I64 class methods --- - - Inst_VOP3__V_CMPX_LT_I64::Inst_VOP3__V_CMPX_LT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_I64 - - Inst_VOP3__V_CMPX_LT_I64::~Inst_VOP3__V_CMPX_LT_I64() - { - } // ~Inst_VOP3__V_CMPX_LT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_I64 class methods --- - - Inst_VOP3__V_CMPX_EQ_I64::Inst_VOP3__V_CMPX_EQ_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_I64 - - Inst_VOP3__V_CMPX_EQ_I64::~Inst_VOP3__V_CMPX_EQ_I64() - { - } // ~Inst_VOP3__V_CMPX_EQ_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_I64 class methods --- - - Inst_VOP3__V_CMPX_LE_I64::Inst_VOP3__V_CMPX_LE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_I64 - - Inst_VOP3__V_CMPX_LE_I64::~Inst_VOP3__V_CMPX_LE_I64() - { - } // ~Inst_VOP3__V_CMPX_LE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_I64 class methods --- - - Inst_VOP3__V_CMPX_GT_I64::Inst_VOP3__V_CMPX_GT_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_I64 - - Inst_VOP3__V_CMPX_GT_I64::~Inst_VOP3__V_CMPX_GT_I64() - { - } // ~Inst_VOP3__V_CMPX_GT_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_I64 class methods --- - - Inst_VOP3__V_CMPX_NE_I64::Inst_VOP3__V_CMPX_NE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_I64 - - Inst_VOP3__V_CMPX_NE_I64::~Inst_VOP3__V_CMPX_NE_I64() - { - } // ~Inst_VOP3__V_CMPX_NE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_I64 class methods --- - - Inst_VOP3__V_CMPX_GE_I64::Inst_VOP3__V_CMPX_GE_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_I64 - - Inst_VOP3__V_CMPX_GE_I64::~Inst_VOP3__V_CMPX_GE_I64() - { - } // ~Inst_VOP3__V_CMPX_GE_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_I64 class methods --- - - Inst_VOP3__V_CMPX_T_I64::Inst_VOP3__V_CMPX_T_I64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_i64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_I64 - - Inst_VOP3__V_CMPX_T_I64::~Inst_VOP3__V_CMPX_T_I64() - { - } // ~Inst_VOP3__V_CMPX_T_I64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_F_U64 class methods --- - - Inst_VOP3__V_CMPX_F_U64::Inst_VOP3__V_CMPX_F_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_f_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_F_U64 - - Inst_VOP3__V_CMPX_F_U64::~Inst_VOP3__V_CMPX_F_U64() - { - } // ~Inst_VOP3__V_CMPX_F_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LT_U64 class methods --- - - Inst_VOP3__V_CMPX_LT_U64::Inst_VOP3__V_CMPX_LT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_lt_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LT_U64 - - Inst_VOP3__V_CMPX_LT_U64::~Inst_VOP3__V_CMPX_LT_U64() - { - } // ~Inst_VOP3__V_CMPX_LT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_EQ_U64 class methods --- - - Inst_VOP3__V_CMPX_EQ_U64::Inst_VOP3__V_CMPX_EQ_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_eq_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_EQ_U64 - - Inst_VOP3__V_CMPX_EQ_U64::~Inst_VOP3__V_CMPX_EQ_U64() - { - } // ~Inst_VOP3__V_CMPX_EQ_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_LE_U64 class methods --- - - Inst_VOP3__V_CMPX_LE_U64::Inst_VOP3__V_CMPX_LE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_le_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_LE_U64 - - Inst_VOP3__V_CMPX_LE_U64::~Inst_VOP3__V_CMPX_LE_U64() - { - } // ~Inst_VOP3__V_CMPX_LE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GT_U64 class methods --- - - Inst_VOP3__V_CMPX_GT_U64::Inst_VOP3__V_CMPX_GT_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_gt_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GT_U64 - - Inst_VOP3__V_CMPX_GT_U64::~Inst_VOP3__V_CMPX_GT_U64() - { - } // ~Inst_VOP3__V_CMPX_GT_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_NE_U64 class methods --- - - Inst_VOP3__V_CMPX_NE_U64::Inst_VOP3__V_CMPX_NE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ne_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_NE_U64 - - Inst_VOP3__V_CMPX_NE_U64::~Inst_VOP3__V_CMPX_NE_U64() - { - } // ~Inst_VOP3__V_CMPX_NE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_GE_U64 class methods --- - - Inst_VOP3__V_CMPX_GE_U64::Inst_VOP3__V_CMPX_GE_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_ge_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_GE_U64 - - Inst_VOP3__V_CMPX_GE_U64::~Inst_VOP3__V_CMPX_GE_U64() - { - } // ~Inst_VOP3__V_CMPX_GE_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. - void - Inst_VOP3__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CMPX_T_U64 class methods --- - - Inst_VOP3__V_CMPX_T_U64::Inst_VOP3__V_CMPX_T_U64( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cmpx_t_u64", true) - { - setFlag(ALU); - setFlag(WritesEXEC); - } // Inst_VOP3__V_CMPX_T_U64 - - Inst_VOP3__V_CMPX_T_U64::~Inst_VOP3__V_CMPX_T_U64() - { - } // ~Inst_VOP3__V_CMPX_T_U64 - - // --- description from .arch file --- - // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
- void - Inst_VOP3__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ScalarOperandU64 sdst(gpuDynInst, instData.VDST); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - sdst.setBit(lane, 1); - } - } - - wf->execMask() = sdst.rawData(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_CNDMASK_B32 class methods --- - - Inst_VOP3__V_CNDMASK_B32::Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cndmask_b32", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - } // Inst_VOP3__V_CNDMASK_B32 - - Inst_VOP3__V_CNDMASK_B32::~Inst_VOP3__V_CNDMASK_B32() - { - } // ~Inst_VOP3__V_CNDMASK_B32 - - // --- description from .arch file --- - // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC - // as a scalar GPR in S2. - void - Inst_VOP3__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(vcc.rawData(), lane) - ? 
src1[lane] : src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_F32 class methods --- - - Inst_VOP3__V_ADD_F32::Inst_VOP3__V_ADD_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_ADD_F32 - - Inst_VOP3__V_ADD_F32::~Inst_VOP3__V_ADD_F32() - { - } // ~Inst_VOP3__V_ADD_F32 - - // --- description from .arch file --- - // D.f = S0.f + S1.f. - void - Inst_VOP3__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUB_F32 class methods --- - - Inst_VOP3__V_SUB_F32::Inst_VOP3__V_SUB_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SUB_F32 - - Inst_VOP3__V_SUB_F32::~Inst_VOP3__V_SUB_F32() - { - } // ~Inst_VOP3__V_SUB_F32 - - // --- description from .arch file --- - // D.f = S0.f - S1.f. - // SQ translates to V_ADD_F32. 
- void - Inst_VOP3__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_F32 class methods --- - - Inst_VOP3__V_SUBREV_F32::Inst_VOP3__V_SUBREV_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_SUBREV_F32 - - Inst_VOP3__V_SUBREV_F32::~Inst_VOP3__V_SUBREV_F32() - { - } // ~Inst_VOP3__V_SUBREV_F32 - - // --- description from .arch file --- - // D.f = S1.f - S0.f. - // SQ translates to V_ADD_F32. 
- void - Inst_VOP3__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_LEGACY_F32 class methods --- - - Inst_VOP3__V_MUL_LEGACY_F32::Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MUL_LEGACY_F32 - - Inst_VOP3__V_MUL_LEGACY_F32::~Inst_VOP3__V_MUL_LEGACY_F32() - { - } // ~Inst_VOP3__V_MUL_LEGACY_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
- void - Inst_VOP3__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - 
} - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_F32 class methods --- - - Inst_VOP3__V_MUL_F32::Inst_VOP3__V_MUL_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MUL_F32 - - Inst_VOP3__V_MUL_F32::~Inst_VOP3__V_MUL_F32() - { - } // ~Inst_VOP3__V_MUL_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f. - void - Inst_VOP3__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || - std::isnan(src1[lane])) { - vdst[lane] = NAN; - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - !std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || - std::fpclassify(src0[lane]) == FP_ZERO) && - std::signbit(src0[lane])) { - if (std::isinf(src1[lane])) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +0.0; - } else { - vdst[lane] = -0.0; - } - } else if (std::isinf(src0[lane]) && - !std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - 
std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (!std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else if (std::isinf(src0[lane]) && - std::signbit(src0[lane])) { - if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || - std::fpclassify(src1[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (std::signbit(src1[lane])) { - vdst[lane] = +INFINITY; - } else { - vdst[lane] = -INFINITY; - } - } else { - vdst[lane] = src0[lane] * src1[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_I32_I24 class methods --- - - Inst_VOP3__V_MUL_I32_I24::Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_i32_i24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_I32_I24 - - Inst_VOP3__V_MUL_I32_I24::~Inst_VOP3__V_MUL_I32_I24() - { - } // ~Inst_VOP3__V_MUL_I32_I24 - - // --- description from .arch file --- - // D.i = S0.i[23:0] * S1.i[23:0]. - void - Inst_VOP3__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) - * sext<24>(bits(src1[lane], 23, 0)); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_I32_I24 class methods --- - - Inst_VOP3__V_MUL_HI_I32_I24::Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_i32_i24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_I32_I24 - - 
Inst_VOP3__V_MUL_HI_I32_I24::~Inst_VOP3__V_MUL_HI_I32_I24() - { - } // ~Inst_VOP3__V_MUL_HI_I32_I24 - - // --- description from .arch file --- - // D.i = (S0.i[23:0] * S1.i[23:0])>>32. - void - Inst_VOP3__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI64 tmp_src0 - = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); - VecElemI64 tmp_src1 - = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); - - vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_U32_U24 class methods --- - - Inst_VOP3__V_MUL_U32_U24::Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_u32_u24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_U32_U24 - - Inst_VOP3__V_MUL_U32_U24::~Inst_VOP3__V_MUL_U32_U24() - { - } // ~Inst_VOP3__V_MUL_U32_U24 - - // --- description from .arch file --- - // D.u = S0.u[23:0] * S1.u[23:0]. 
- void - Inst_VOP3__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_HI_U32_U24 class methods --- - - Inst_VOP3__V_MUL_HI_U32_U24::Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_hi_u32_u24", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_HI_U32_U24 - - Inst_VOP3__V_MUL_HI_U32_U24::~Inst_VOP3__V_MUL_HI_U32_U24() - { - } // ~Inst_VOP3__V_MUL_HI_U32_U24 - - // --- description from .arch file --- - // D.i = (S0.u[23:0] * S1.u[23:0])>>32. 
- void - Inst_VOP3__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); - VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); - vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_F32 class methods --- - - Inst_VOP3__V_MIN_F32::Inst_VOP3__V_MIN_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MIN_F32 - - Inst_VOP3__V_MIN_F32::~Inst_VOP3__V_MIN_F32() - { - } // ~Inst_VOP3__V_MIN_F32 - - // --- description from .arch file --- - // D.f = (S0.f < S1.f ? S0.f : S1.f). 
- void - Inst_VOP3__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmin(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_F32 class methods --- - - Inst_VOP3__V_MAX_F32::Inst_VOP3__V_MAX_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MAX_F32 - - Inst_VOP3__V_MAX_F32::~Inst_VOP3__V_MAX_F32() - { - } // ~Inst_VOP3__V_MAX_F32 - - // --- description from .arch file --- - // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
- void - Inst_VOP3__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fmax(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_I32 class methods --- - - Inst_VOP3__V_MIN_I32::Inst_VOP3__V_MIN_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_I32 - - Inst_VOP3__V_MIN_I32::~Inst_VOP3__V_MIN_I32() - { - } // ~Inst_VOP3__V_MIN_I32 - - // --- description from .arch file --- - // D.i = min(S0.i, S1.i). 
- void - Inst_VOP3__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_I32 class methods --- - - Inst_VOP3__V_MAX_I32::Inst_VOP3__V_MAX_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_I32 - - Inst_VOP3__V_MAX_I32::~Inst_VOP3__V_MAX_I32() - { - } // ~Inst_VOP3__V_MAX_I32 - - // --- description from .arch file --- - // D.i = max(S0.i, S1.i). 
- void - Inst_VOP3__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_U32 class methods --- - - Inst_VOP3__V_MIN_U32::Inst_VOP3__V_MIN_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_U32 - - Inst_VOP3__V_MIN_U32::~Inst_VOP3__V_MIN_U32() - { - } // ~Inst_VOP3__V_MIN_U32 - - // --- description from .arch file --- - // D.u = min(S0.u, S1.u). 
- void - Inst_VOP3__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_U32 class methods --- - - Inst_VOP3__V_MAX_U32::Inst_VOP3__V_MAX_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_U32 - - Inst_VOP3__V_MAX_U32::~Inst_VOP3__V_MAX_U32() - { - } // ~Inst_VOP3__V_MAX_U32 - - // --- description from .arch file --- - // D.u = max(S0.u, S1.u). 
- void - Inst_VOP3__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHRREV_B32 class methods --- - - Inst_VOP3__V_LSHRREV_B32::Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshrrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHRREV_B32 - - Inst_VOP3__V_LSHRREV_B32::~Inst_VOP3__V_LSHRREV_B32() - { - } // ~Inst_VOP3__V_LSHRREV_B32 - - // --- description from .arch file --- - // D.u = S1.u >> S0.u[4:0]. - // The vacated bits are set to zero. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ASHRREV_I32 class methods --- - - Inst_VOP3__V_ASHRREV_I32::Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ashrrev_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ASHRREV_I32 - - Inst_VOP3__V_ASHRREV_I32::~Inst_VOP3__V_ASHRREV_I32() - { - } // ~Inst_VOP3__V_ASHRREV_I32 - - // --- description from .arch file --- - // D.i = signext(S1.i) >> S0.i[4:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHLREV_B32 class methods --- - - Inst_VOP3__V_LSHLREV_B32::Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshlrev_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B32 - - Inst_VOP3__V_LSHLREV_B32::~Inst_VOP3__V_LSHLREV_B32() - { - } // ~Inst_VOP3__V_LSHLREV_B32 - - // --- description from .arch file --- - // D.u = S1.u << S0.u[4:0]. - // SQ translates this to an internal SP opcode. 
- void - Inst_VOP3__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_AND_B32 class methods --- - - Inst_VOP3__V_AND_B32::Inst_VOP3__V_AND_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_and_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_AND_B32 - - Inst_VOP3__V_AND_B32::~Inst_VOP3__V_AND_B32() - { - } // ~Inst_VOP3__V_AND_B32 - - // --- description from .arch file --- - // D.u = S0.u & S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] & src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_OR_B32 class methods --- - - Inst_VOP3__V_OR_B32::Inst_VOP3__V_OR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_OR_B32 - - Inst_VOP3__V_OR_B32::~Inst_VOP3__V_OR_B32() - { - } // ~Inst_VOP3__V_OR_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_OR3_B32 class methods --- - - Inst_VOP3__V_OR3_B32::Inst_VOP3__V_OR3_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_or3_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_OR3_B32 - - Inst_VOP3__V_OR3_B32::~Inst_VOP3__V_OR3_B32() - { - } // ~Inst_VOP3__V_OR3_B32 - - // --- description from .arch file --- - // D.u = S0.u | S1.u | S2.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_OR3_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane] | src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_XOR_B32 class methods --- - - Inst_VOP3__V_XOR_B32::Inst_VOP3__V_XOR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_xor_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_XOR_B32 - - Inst_VOP3__V_XOR_B32::~Inst_VOP3__V_XOR_B32() - { - } // ~Inst_VOP3__V_XOR_B32 - - // --- description from .arch file --- - // D.u = S0.u ^ S1.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] ^ src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAC_F32 class methods --- - - Inst_VOP3__V_MAC_F32::Inst_VOP3__V_MAC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mac_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAC); - } // Inst_VOP3__V_MAC_F32 - - Inst_VOP3__V_MAC_F32::~Inst_VOP3__V_MAC_F32() - { - } // ~Inst_VOP3__V_MAC_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + D.f. - // SQ translates to V_MAD_F32. 
- void - Inst_VOP3__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vdst.read(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_CO_U32 class methods --- - - Inst_VOP3__V_ADD_CO_U32::Inst_VOP3__V_ADD_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_add_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP3__V_ADD_CO_U32 - - Inst_VOP3__V_ADD_CO_U32::~Inst_VOP3__V_ADD_CO_U32() - { - } // ~Inst_VOP3__V_ADD_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u; - // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED - // --- overflow or carry-out for V_ADDC_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. 
- void - Inst_VOP3__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP3__V_SUB_CO_U32 class methods --- - - Inst_VOP3__V_SUB_CO_U32::Inst_VOP3__V_SUB_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_sub_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP3__V_SUB_CO_U32 - - Inst_VOP3__V_SUB_CO_U32::~Inst_VOP3__V_SUB_CO_U32() - { - } // ~Inst_VOP3__V_SUB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u; - // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. 
- void - Inst_VOP3__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane]; - vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP3__V_SUBREV_CO_U32 class methods --- - - Inst_VOP3__V_SUBREV_CO_U32::Inst_VOP3__V_SUBREV_CO_U32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_subrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - } // Inst_VOP3__V_SUBREV_CO_U32 - - Inst_VOP3__V_SUBREV_CO_U32::~Inst_VOP3__V_SUBREV_CO_U32() - { - } // ~Inst_VOP3__V_SUBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u; - // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or - // carry-out for V_SUBB_U32. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair. - // SQ translates this to V_SUB_U32 with reversed operands. 
- void - Inst_VOP3__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane]; - vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); - } - } - - vdst.write(); - vcc.write(); - } // execute - // --- Inst_VOP3__V_ADDC_CO_U32 class methods --- - - Inst_VOP3__V_ADDC_CO_U32::Inst_VOP3__V_ADDC_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_addc_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP3__V_ADDC_CO_U32 - - Inst_VOP3__V_ADDC_CO_U32::~Inst_VOP3__V_ADDC_CO_U32() - { - } // ~Inst_VOP3__V_ADDC_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + VCC[threadId]; - // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0) - // is an UNSIGNED overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. 
- void - Inst_VOP3__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] - + bits(vcc.rawData(), lane); - sdst.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] - + (VecElemU64)bits(vcc.rawData(), lane)) - >= 0x100000000 ? 1 : 0); - } - } - - vdst.write(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_SUBB_CO_U32 class methods --- - - Inst_VOP3__V_SUBB_CO_U32::Inst_VOP3__V_SUBB_CO_U32(InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_subb_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP3__V_SUBB_CO_U32 - - Inst_VOP3__V_SUBB_CO_U32::~Inst_VOP3__V_SUBB_CO_U32() - { - } // ~Inst_VOP3__V_SUBB_CO_U32 - - // --- description from .arch file --- - // D.u = S0.u - S1.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // --- overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // --- source comes from the SGPR-pair at S2.u. 
- void - Inst_VOP3__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - ScalarOperandU64 sdst(gpuDynInst, instData.SDST); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] - src1[lane] - - bits(vcc.rawData(), lane); - sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods --- - - Inst_VOP3__V_SUBBREV_CO_U32::Inst_VOP3__V_SUBBREV_CO_U32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_subbrev_co_u32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(ReadsVCC); - } // Inst_VOP3__V_SUBBREV_CO_U32 - - Inst_VOP3__V_SUBBREV_CO_U32::~Inst_VOP3__V_SUBBREV_CO_U32() - { - } // ~Inst_VOP3__V_SUBBREV_CO_U32 - - // --- description from .arch file --- - // D.u = S1.u - S0.u - VCC[threadId]; - // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED - // overflow. - // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC - // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. 
- void - Inst_VOP3__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST); - ScalarOperandU64 vcc(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - vcc.read(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] - src0[lane] - - bits(vcc.rawData(), lane); - sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) - > src0[lane] ? 1 : 0); - } - } - - vdst.write(); - sdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_F16 class methods --- - - Inst_VOP3__V_ADD_F16::Inst_VOP3__V_ADD_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_ADD_F16 - - Inst_VOP3__V_ADD_F16::~Inst_VOP3__V_ADD_F16() - { - } // ~Inst_VOP3__V_ADD_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP3__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SUB_F16 class methods --- - - Inst_VOP3__V_SUB_F16::Inst_VOP3__V_SUB_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SUB_F16 - - Inst_VOP3__V_SUB_F16::~Inst_VOP3__V_SUB_F16() - { - } // ~Inst_VOP3__V_SUB_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 - S1.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. 
- void - Inst_VOP3__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SUBREV_F16 class methods --- - - Inst_VOP3__V_SUBREV_F16::Inst_VOP3__V_SUBREV_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_subrev_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SUBREV_F16 - - Inst_VOP3__V_SUBREV_F16::~Inst_VOP3__V_SUBREV_F16() - { - } // ~Inst_VOP3__V_SUBREV_F16 - - // --- description from .arch file --- - // D.f16 = S1.f16 - S0.f16. - // Supports denormals, round mode, exception flags, saturation. - // SQ translates to V_ADD_F16. - void - Inst_VOP3__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MUL_F16 class methods --- - - Inst_VOP3__V_MUL_F16::Inst_VOP3__V_MUL_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_MUL_F16 - - Inst_VOP3__V_MUL_F16::~Inst_VOP3__V_MUL_F16() - { - } // ~Inst_VOP3__V_MUL_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16. - // Supports denormals, round mode, exception flags, saturation. - void - Inst_VOP3__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAC_F16 class methods --- - - Inst_VOP3__V_MAC_F16::Inst_VOP3__V_MAC_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mac_f16", false) - { - setFlag(ALU); - setFlag(F16); - setFlag(MAC); - } // Inst_VOP3__V_MAC_F16 - - Inst_VOP3__V_MAC_F16::~Inst_VOP3__V_MAC_F16() - { - } // ~Inst_VOP3__V_MAC_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + D.f16. - // Supports round mode, exception flags, saturation. - // SQ translates this to V_MAD_F16. 
- void - Inst_VOP3__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_ADD_U16 class methods --- - - Inst_VOP3__V_ADD_U16::Inst_VOP3__V_ADD_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD_U16 - - Inst_VOP3__V_ADD_U16::~Inst_VOP3__V_ADD_U16() - { - } // ~Inst_VOP3__V_ADD_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 + S1.u16. - // Supports saturation (unsigned 16-bit integer domain). - void - Inst_VOP3__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SUB_U16 class methods --- - - Inst_VOP3__V_SUB_U16::Inst_VOP3__V_SUB_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sub_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SUB_U16 - - Inst_VOP3__V_SUB_U16::~Inst_VOP3__V_SUB_U16() - { - } // ~Inst_VOP3__V_SUB_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 - S1.u16. - // Supports saturation (unsigned 16-bit integer domain). 
    // Lane-wise 16-bit unsigned subtract (S0 - S1) for all lanes enabled
    // in the EXEC mask.
    void
    Inst_VOP3__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SUBREV_U16 class methods ---

    Inst_VOP3__V_SUBREV_U16::Inst_VOP3__V_SUBREV_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_subrev_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SUBREV_U16

    Inst_VOP3__V_SUBREV_U16::~Inst_VOP3__V_SUBREV_U16()
    {
    } // ~Inst_VOP3__V_SUBREV_U16

    // --- description from .arch file ---
    // D.u16 = S1.u16 - S0.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    // SQ translates this to V_SUB_U16 with reversed operands.
    // Lane-wise reversed 16-bit unsigned subtract (S1 - S0) for all lanes
    // enabled in the EXEC mask.
    void
    Inst_VOP3__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // operand order reversed relative to V_SUB_U16
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MUL_LO_U16 class methods ---

    Inst_VOP3__V_MUL_LO_U16::Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mul_lo_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_LO_U16

    Inst_VOP3__V_MUL_LO_U16::~Inst_VOP3__V_MUL_LO_U16()
    {
    } // ~Inst_VOP3__V_MUL_LO_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 * S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    // Lane-wise 16-bit unsigned multiply (low 16 bits of the product kept
    // by the u16 destination) for all lanes enabled in the EXEC mask.
    void
    Inst_VOP3__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LSHLREV_B16 class methods ---

    Inst_VOP3__V_LSHLREV_B16::Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_lshlrev_b16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHLREV_B16

    Inst_VOP3__V_LSHLREV_B16::~Inst_VOP3__V_LSHLREV_B16()
    {
    } // ~Inst_VOP3__V_LSHLREV_B16

    // --- description from .arch file ---
    // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
    // SQ translates this to an internal SP opcode.
    // Lane-wise reversed 16-bit left shift: the value comes from SRC1 and
    // the shift amount from the low 4 bits of SRC0.
    void
    Inst_VOP3__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // only bits [3:0] of SRC0 give the shift amount
                vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LSHRREV_B16 class methods ---

    Inst_VOP3__V_LSHRREV_B16::Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_lshrrev_b16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHRREV_B16

    Inst_VOP3__V_LSHRREV_B16::~Inst_VOP3__V_LSHRREV_B16()
    {
    } // ~Inst_VOP3__V_LSHRREV_B16

    // --- description from .arch file ---
    // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
    // The vacated bits are set to zero.
    // SQ translates this to an internal SP opcode.
- void - Inst_VOP3__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ASHRREV_I16 class methods --- - - Inst_VOP3__V_ASHRREV_I16::Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ashrrev_i16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ASHRREV_I16 - - Inst_VOP3__V_ASHRREV_I16::~Inst_VOP3__V_ASHRREV_I16() - { - } // ~Inst_VOP3__V_ASHRREV_I16 - - // --- description from .arch file --- - // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. - // The vacated bits are set to the sign bit of the input value. - // SQ translates this to an internal SP opcode. 
    // Lane-wise reversed 16-bit arithmetic right shift: the signed value
    // comes from SRC1 (ConstVecOperandI16, so >> sign-extends) and the
    // shift amount from the low 4 bits of SRC0.
    void
    Inst_VOP3__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MAX_F16 class methods ---

    Inst_VOP3__V_MAX_F16::Inst_VOP3__V_MAX_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_max_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_MAX_F16

    Inst_VOP3__V_MAX_F16::~Inst_VOP3__V_MAX_F16()
    {
    } // ~Inst_VOP3__V_MAX_F16

    // --- description from .arch file ---
    // D.f16 = max(S0.f16, S1.f16).
    // IEEE compliant. Supports denormals, round mode, exception flags,
    // saturation.
    // Unimplemented: executing this opcode panics the simulation.
    void
    Inst_VOP3__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MIN_F16 class methods ---

    Inst_VOP3__V_MIN_F16::Inst_VOP3__V_MIN_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_min_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_MIN_F16

    Inst_VOP3__V_MIN_F16::~Inst_VOP3__V_MIN_F16()
    {
    } // ~Inst_VOP3__V_MIN_F16

    // --- description from .arch file ---
    // D.f16 = min(S0.f16, S1.f16).
    // IEEE compliant. Supports denormals, round mode, exception flags,
    // saturation.
- void - Inst_VOP3__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAX_U16 class methods --- - - Inst_VOP3__V_MAX_U16::Inst_VOP3__V_MAX_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_U16 - - Inst_VOP3__V_MAX_U16::~Inst_VOP3__V_MAX_U16() - { - } // ~Inst_VOP3__V_MAX_U16 - - // --- description from .arch file --- - // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). - void - Inst_VOP3__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX_I16 class methods --- - - Inst_VOP3__V_MAX_I16::Inst_VOP3__V_MAX_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max_i16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX_I16 - - Inst_VOP3__V_MAX_I16::~Inst_VOP3__V_MAX_I16() - { - } // ~Inst_VOP3__V_MAX_I16 - - // --- description from .arch file --- - // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). 
- void - Inst_VOP3__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::max(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_U16 class methods --- - - Inst_VOP3__V_MIN_U16::Inst_VOP3__V_MIN_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_U16 - - Inst_VOP3__V_MIN_U16::~Inst_VOP3__V_MIN_U16() - { - } // ~Inst_VOP3__V_MIN_U16 - - // --- description from .arch file --- - // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). 
- void - Inst_VOP3__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN_I16 class methods --- - - Inst_VOP3__V_MIN_I16::Inst_VOP3__V_MIN_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min_i16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN_I16 - - Inst_VOP3__V_MIN_I16::~Inst_VOP3__V_MIN_I16() - { - } // ~Inst_VOP3__V_MIN_I16 - - // --- description from .arch file --- - // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). 
- void - Inst_VOP3__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::min(src0[lane], src1[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LDEXP_F16 class methods --- - - Inst_VOP3__V_LDEXP_F16::Inst_VOP3__V_LDEXP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ldexp_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_LDEXP_F16 - - Inst_VOP3__V_LDEXP_F16::~Inst_VOP3__V_LDEXP_F16() - { - } // ~Inst_VOP3__V_LDEXP_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * (2 ** S1.i16). - void - Inst_VOP3__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_ADD_U32 class methods --- - - Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD_U32 - - Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32() - { - } // ~Inst_VOP3__V_ADD_U32 - - // --- description from .arch file --- - // D.u32 = S0.u32 + S1.u32. 
    // Lane-wise 32-bit unsigned add (wrap-around, no carry out) for all
    // lanes enabled in the EXEC mask.
    void
    Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SUB_U32 class methods ---

    Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_sub_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SUB_U32

    Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32()
    {
    } // ~Inst_VOP3__V_SUB_U32

    // --- description from .arch file ---
    // D.u32 = S0.u32 - S1.u32.
    // Lane-wise 32-bit unsigned subtract (S0 - S1, wrap-around) for all
    // lanes enabled in the EXEC mask.
    void
    Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SUBREV_U32 class methods ---

    Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_subrev_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_SUBREV_U32

    Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32()
    {
    } // ~Inst_VOP3__V_SUBREV_U32

    // --- description from .arch file ---
    // D.u32 = S1.u32 - S0.u32.
    // Lane-wise reversed 32-bit unsigned subtract (S1 - S0) for all lanes
    // enabled in the EXEC mask.
    void
    Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // operand order reversed relative to V_SUB_U32
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_NOP class methods ---

    Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_nop", false)
    {
        setFlag(Nop);
        setFlag(ALU);
    } // Inst_VOP3__V_NOP

    Inst_VOP3__V_NOP::~Inst_VOP3__V_NOP()
    {
    } // ~Inst_VOP3__V_NOP

    // --- description from .arch file ---
    // Do nothing.
    // Intentionally empty: v_nop has no architectural effect.
    void
    Inst_VOP3__V_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
    } // execute
    // --- Inst_VOP3__V_MOV_B32 class methods ---

    Inst_VOP3__V_MOV_B32::Inst_VOP3__V_MOV_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mov_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MOV_B32

    Inst_VOP3__V_MOV_B32::~Inst_VOP3__V_MOV_B32()
    {
    } // ~Inst_VOP3__V_MOV_B32

    // --- description from .arch file ---
    // D.u = S0.u.
    // Input and output modifiers not supported; this is an untyped operation.
- void - Inst_VOP3__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_I32_F64 class methods --- - - Inst_VOP3__V_CVT_I32_F64::Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_i32_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_I32_F64 - - Inst_VOP3__V_CVT_I32_F64::~Inst_VOP3__V_CVT_I32_F64() - { - } // ~Inst_VOP3__V_CVT_I32_F64 - - // --- description from .arch file --- - // D.i = (int)S0.d. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. - void - Inst_VOP3__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane]) || exp > 30) { - if (std::signbit(src[lane])) { - vdst[lane] = INT_MIN; - } else { - vdst[lane] = INT_MAX; - } - } else { - vdst[lane] = (VecElemI32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F64_I32 class methods --- - - Inst_VOP3__V_CVT_F64_I32::Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f64_i32", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F64_I32 - - Inst_VOP3__V_CVT_F64_I32::~Inst_VOP3__V_CVT_F64_I32() - { - } // ~Inst_VOP3__V_CVT_F64_I32 - - // --- 
description from .arch file --- - // D.d = (double)S0.i. - void - Inst_VOP3__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF64)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_I32 class methods --- - - Inst_VOP3__V_CVT_F32_I32::Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_i32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_I32 - - Inst_VOP3__V_CVT_F32_I32::~Inst_VOP3__V_CVT_F32_I32() - { - } // ~Inst_VOP3__V_CVT_F32_I32 - - // --- description from .arch file --- - // D.f = (float)S0.i. - void - Inst_VOP3__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - VecOperandI32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F32_U32 class methods --- - - Inst_VOP3__V_CVT_F32_U32::Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f32_u32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_F32_U32 - - Inst_VOP3__V_CVT_F32_U32::~Inst_VOP3__V_CVT_F32_U32() - { - } // ~Inst_VOP3__V_CVT_F32_U32 - - // --- description from 
.arch file --- - // D.f = (float)S0.u. - void - Inst_VOP3__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (VecElemF32)src[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_U32_F32 class methods --- - - Inst_VOP3__V_CVT_U32_F32::Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_u32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_U32_F32 - - Inst_VOP3__V_CVT_U32_F32::~Inst_VOP3__V_CVT_U32_F32() - { - } // ~Inst_VOP3__V_CVT_U32_F32 - - // --- description from .arch file --- - // D.u = (unsigned)S0.f. - // Out-of-range floating point values (including infinity) saturate. NaN is - // --- converted to 0. 
    // Lane-wise float-to-unsigned conversion with saturation: NaN -> 0,
    // -inf -> 0, +inf or |src| >= 2^32 -> UINT_MAX, else C-style cast.
    void
    Inst_VOP3__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
                // exp > 31 means |src| >= 2^32: out of u32 range
                std::frexp(src[lane],&exp);
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (std::isinf(src[lane])) {
                    if (std::signbit(src[lane])) {
                        vdst[lane] = 0;
                    } else {
                        vdst[lane] = UINT_MAX;
                    }
                } else if (exp > 31) {
                    vdst[lane] = UINT_MAX;
                } else {
                    vdst[lane] = (VecElemU32)src[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_I32_F32 class methods ---

    Inst_VOP3__V_CVT_I32_F32::Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_i32_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_I32_F32

    Inst_VOP3__V_CVT_I32_F32::~Inst_VOP3__V_CVT_I32_F32()
    {
    } // ~Inst_VOP3__V_CVT_I32_F32

    // --- description from .arch file ---
    // D.i = (int)S0.f.
    // Out-of-range floating point values (including infinity) saturate. NaN is
    // --- converted to 0.
    // Lane-wise float-to-signed conversion with saturation: NaN -> 0,
    // +/-inf or |src| >= 2^31 -> INT_MAX/INT_MIN, else C-style cast.
    void
    Inst_VOP3__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // ABS/NEG bit 0 applies to the single FP source operand
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                int exp;
                // exp > 30 means |src| >= 2^31: out of i32 range
                std::frexp(src[lane],&exp);
                if (std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else if (std::isinf(src[lane]) || exp > 30) {
                    if (std::signbit(src[lane])) {
                        vdst[lane] = INT_MIN;
                    } else {
                        vdst[lane] = INT_MAX;
                    }
                } else {
                    vdst[lane] = (VecElemI32)src[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MOV_FED_B32 class methods ---

    Inst_VOP3__V_MOV_FED_B32::Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mov_fed_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MOV_FED_B32

    Inst_VOP3__V_MOV_FED_B32::~Inst_VOP3__V_MOV_FED_B32()
    {
    } // ~Inst_VOP3__V_MOV_FED_B32

    // --- description from .arch file ---
    // D.u = S0.u;
    // Introduce EDC double error upon write to dest vgpr without causing an
    // --- exception.
    // Input and output modifiers not supported; this is an untyped operation.
    // Unimplemented: executing this opcode panics the simulation.
    void
    Inst_VOP3__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_F16_F32 class methods ---

    Inst_VOP3__V_CVT_F16_F32::Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f16_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F16_F32

    Inst_VOP3__V_CVT_F16_F32::~Inst_VOP3__V_CVT_F16_F32()
    {
    } // ~Inst_VOP3__V_CVT_F16_F32

    // --- description from .arch file ---
    // D.f16 = flt32_to_flt16(S0.f).
    // Supports input modifiers and creates FP16 denormals when appropriate.
    // Unimplemented: executing this opcode panics the simulation.
    void
    Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_F32_F16 class methods ---

    Inst_VOP3__V_CVT_F32_F16::Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f32_f16", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_F16

    Inst_VOP3__V_CVT_F32_F16::~Inst_VOP3__V_CVT_F32_F16()
    {
    } // ~Inst_VOP3__V_CVT_F32_F16

    // --- description from .arch file ---
    // D.f = flt16_to_flt32(S0.f16).
    // FP16 denormal inputs are always accepted.
    // Unimplemented: executing this opcode panics the simulation.
    void
    Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods ---

    Inst_VOP3__V_CVT_RPI_I32_F32::Inst_VOP3__V_CVT_RPI_I32_F32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_rpi_i32_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_RPI_I32_F32

    Inst_VOP3__V_CVT_RPI_I32_F32::~Inst_VOP3__V_CVT_RPI_I32_F32()
    {
    } // ~Inst_VOP3__V_CVT_RPI_I32_F32

    // --- description from .arch file ---
    // D.i = (int)floor(S0.f + 0.5).
    // Lane-wise round-to-positive-infinity-style conversion:
    // floor(src + 0.5) truncated to i32.
    // NOTE(review): no range saturation here, unlike V_CVT_I32_F32 —
    // confirm intended.
    void
    Inst_VOP3__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_FLR_I32_F32 class methods ---

    Inst_VOP3__V_CVT_FLR_I32_F32::Inst_VOP3__V_CVT_FLR_I32_F32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_flr_i32_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_FLR_I32_F32

    Inst_VOP3__V_CVT_FLR_I32_F32::~Inst_VOP3__V_CVT_FLR_I32_F32()
    {
    } // ~Inst_VOP3__V_CVT_FLR_I32_F32

    // --- description from .arch file ---
    // D.i = (int)floor(S0.f).
    // Lane-wise floor-then-truncate conversion to i32.
    void
    Inst_VOP3__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemI32)std::floor(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_OFF_F32_I4 class methods ---

    Inst_VOP3__V_CVT_OFF_F32_I4::Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_off_f32_i4", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_OFF_F32_I4

    Inst_VOP3__V_CVT_OFF_F32_I4::~Inst_VOP3__V_CVT_OFF_F32_I4()
    {
    } // ~Inst_VOP3__V_CVT_OFF_F32_I4

    // --- description from .arch file ---
    // 4-bit signed int to 32-bit float. Used for interpolation in shader.
    // Unimplemented: executing this opcode panics the simulation.
    void
    Inst_VOP3__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst)
    {
        // Could not parse sq_uc.arch desc field
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_F32_F64 class methods ---

    Inst_VOP3__V_CVT_F32_F64::Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f32_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_F32_F64

    Inst_VOP3__V_CVT_F32_F64::~Inst_VOP3__V_CVT_F32_F64()
    {
    } // ~Inst_VOP3__V_CVT_F32_F64

    // --- description from .arch file ---
    // D.f = (float)S0.d.
    // Lane-wise narrowing conversion from f64 to f32.
    void
    Inst_VOP3__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // ABS/NEG bit 0 applies to the single FP source operand
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_F64_F32 class methods ---

    Inst_VOP3__V_CVT_F64_F32::Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f64_f32", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_F64_F32

    Inst_VOP3__V_CVT_F64_F32::~Inst_VOP3__V_CVT_F64_F32()
    {
    } // ~Inst_VOP3__V_CVT_F64_F32

    // --- description from .arch file ---
    // D.d = (double)S0.f.
    // Lane-wise widening conversion from f32 to f64.
    void
    Inst_VOP3__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        // ABS/NEG bit 0 applies to the single FP source operand
        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF64)src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_F32_UBYTE0 class methods ---

    Inst_VOP3__V_CVT_F32_UBYTE0::Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte0", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_UBYTE0

    Inst_VOP3__V_CVT_F32_UBYTE0::~Inst_VOP3__V_CVT_F32_UBYTE0()
    {
    } // ~Inst_VOP3__V_CVT_F32_UBYTE0

    // --- description from .arch file ---
    // D.f = (float)(S0.u[7:0]).
    // Lane-wise conversion of byte 0 (bits [7:0]) of the source to f32.
    // NOTE(review): ABS/NEG modifiers are applied to an unsigned source
    // here while sibling integer ops assert them absent — confirm intended.
    void
    Inst_VOP3__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)bits(src[lane], 7, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_F32_UBYTE1 class methods ---

    Inst_VOP3__V_CVT_F32_UBYTE1::Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte1", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_UBYTE1

    Inst_VOP3__V_CVT_F32_UBYTE1::~Inst_VOP3__V_CVT_F32_UBYTE1()
    {
    } // ~Inst_VOP3__V_CVT_F32_UBYTE1

    // --- description from .arch file ---
    // D.f = (float)(S0.u[15:8]).
    // Lane-wise conversion of byte 1 (bits [15:8]) of the source to f32.
    void
    Inst_VOP3__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)bits(src[lane], 15, 8);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_F32_UBYTE2 class methods ---

    Inst_VOP3__V_CVT_F32_UBYTE2::Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte2", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_UBYTE2

    Inst_VOP3__V_CVT_F32_UBYTE2::~Inst_VOP3__V_CVT_F32_UBYTE2()
    {
    } // ~Inst_VOP3__V_CVT_F32_UBYTE2

    // --- description from .arch file ---
    // D.f = (float)(S0.u[23:16]).
    // Convert byte 2 (bits 23:16) of the u32 source to f32, per active lane.
    void
    Inst_VOP3__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)bits(src[lane], 23, 16);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_F32_UBYTE3 class methods ---

    Inst_VOP3__V_CVT_F32_UBYTE3::Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte3", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_F32_UBYTE3

    Inst_VOP3__V_CVT_F32_UBYTE3::~Inst_VOP3__V_CVT_F32_UBYTE3()
    {
    } // ~Inst_VOP3__V_CVT_F32_UBYTE3

    // --- description from .arch file ---
    // D.f = (float)(S0.u[31:24]).
    // Convert byte 3 (bits 31:24) of the u32 source to f32, per active lane.
    void
    Inst_VOP3__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF32)bits(src[lane], 31, 24);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CVT_U32_F64 class methods ---

    Inst_VOP3__V_CVT_U32_F64::Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_u32_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CVT_U32_F64

    Inst_VOP3__V_CVT_U32_F64::~Inst_VOP3__V_CVT_U32_F64()
    {
    } // ~Inst_VOP3__V_CVT_U32_F64

    // --- description from .arch file ---
    // D.u = (unsigned)S0.d.
    // Out-of-range floating point values (including infinity) saturate.
NaN is - // --- converted to 0. - void - Inst_VOP3__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp; - std::frexp(src[lane],&exp); - if (std::isnan(src[lane])) { - vdst[lane] = 0; - } else if (std::isinf(src[lane])) { - if (std::signbit(src[lane])) { - vdst[lane] = 0; - } else { - vdst[lane] = UINT_MAX; - } - } else if (exp > 31) { - vdst[lane] = UINT_MAX; - } else { - vdst[lane] = (VecElemU32)src[lane]; - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_F64_U32 class methods --- - - Inst_VOP3__V_CVT_F64_U32::Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f64_u32", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_CVT_F64_U32 - - Inst_VOP3__V_CVT_F64_U32::~Inst_VOP3__V_CVT_F64_U32() - { - } // ~Inst_VOP3__V_CVT_F64_U32 - - // --- description from .arch file --- - // D.d = (double)S0.u. 
    // Convert each active lane's u32 source to f64 (D.d = (double)S0.u).
    void
    Inst_VOP3__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = (VecElemF64)src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_TRUNC_F64 class methods ---

    Inst_VOP3__V_TRUNC_F64::Inst_VOP3__V_TRUNC_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_trunc_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_TRUNC_F64

    Inst_VOP3__V_TRUNC_F64::~Inst_VOP3__V_TRUNC_F64()
    {
    } // ~Inst_VOP3__V_TRUNC_F64

    // --- description from .arch file ---
    // D.d = trunc(S0.d), return integer part of S0.d.
    // Round the f64 source toward zero, per active lane.
    void
    Inst_VOP3__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::trunc(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CEIL_F64 class methods ---

    Inst_VOP3__V_CEIL_F64::Inst_VOP3__V_CEIL_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ceil_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_CEIL_F64

    Inst_VOP3__V_CEIL_F64::~Inst_VOP3__V_CEIL_F64()
    {
    } // ~Inst_VOP3__V_CEIL_F64

    // --- description from .arch file ---
    // D.d = trunc(S0.d);
    // if(S0.d > 0.0 && S0.d != D.d) then D.d += 1.0.
    // Round the f64 source toward +infinity, per active lane
    // (std::ceil matches the trunc-then-adjust description above).
    void
    Inst_VOP3__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::ceil(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RNDNE_F64 class methods ---

    Inst_VOP3__V_RNDNE_F64::Inst_VOP3__V_RNDNE_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rndne_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_RNDNE_F64

    Inst_VOP3__V_RNDNE_F64::~Inst_VOP3__V_RNDNE_F64()
    {
    } // ~Inst_VOP3__V_RNDNE_F64

    // --- description from .arch file ---
    // D.d = round_nearest_even(S0.d).
    // Round the f64 source to the nearest integer, ties to even.
    void
    Inst_VOP3__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = roundNearestEven(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_FLOOR_F64 class methods ---

    Inst_VOP3__V_FLOOR_F64::Inst_VOP3__V_FLOOR_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_floor_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_FLOOR_F64

    Inst_VOP3__V_FLOOR_F64::~Inst_VOP3__V_FLOOR_F64()
    {
    } // ~Inst_VOP3__V_FLOOR_F64

    // --- description from .arch file ---
    // D.d = trunc(S0.d);
    // if(S0.d < 0.0 && S0.d != D.d) then D.d += -1.0.
- void - Inst_VOP3__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::floor(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FRACT_F32 class methods --- - - Inst_VOP3__V_FRACT_F32::Inst_VOP3__V_FRACT_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fract_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FRACT_F32 - - Inst_VOP3__V_FRACT_F32::~Inst_VOP3__V_FRACT_F32() - { - } // ~Inst_VOP3__V_FRACT_F32 - - // --- description from .arch file --- - // D.f = S0.f - floor(S0.f). - void - Inst_VOP3__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_TRUNC_F32 class methods --- - - Inst_VOP3__V_TRUNC_F32::Inst_VOP3__V_TRUNC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trunc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_TRUNC_F32 - - Inst_VOP3__V_TRUNC_F32::~Inst_VOP3__V_TRUNC_F32() - { - } // ~Inst_VOP3__V_TRUNC_F32 - - // --- description from .arch file --- - // D.f = trunc(S0.f), return integer part of S0.f. 
    // Round the f32 source toward zero, per active lane.
    void
    Inst_VOP3__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::trunc(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_CEIL_F32 class methods ---

    Inst_VOP3__V_CEIL_F32::Inst_VOP3__V_CEIL_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ceil_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CEIL_F32

    Inst_VOP3__V_CEIL_F32::~Inst_VOP3__V_CEIL_F32()
    {
    } // ~Inst_VOP3__V_CEIL_F32

    // --- description from .arch file ---
    // D.f = trunc(S0.f);
    // if(S0.f > 0.0 && S0.f != D.f) then D.f += 1.0.
    // Round the f32 source toward +infinity, per active lane.
    void
    Inst_VOP3__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::ceil(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RNDNE_F32 class methods ---

    Inst_VOP3__V_RNDNE_F32::Inst_VOP3__V_RNDNE_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rndne_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RNDNE_F32

    Inst_VOP3__V_RNDNE_F32::~Inst_VOP3__V_RNDNE_F32()
    {
    } // ~Inst_VOP3__V_RNDNE_F32

    // --- description from .arch file ---
    // D.f = round_nearest_even(S0.f).
    // Round the f32 source to the nearest integer, ties to even.
    void
    Inst_VOP3__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = roundNearestEven(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_FLOOR_F32 class methods ---

    Inst_VOP3__V_FLOOR_F32::Inst_VOP3__V_FLOOR_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_floor_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_FLOOR_F32

    Inst_VOP3__V_FLOOR_F32::~Inst_VOP3__V_FLOOR_F32()
    {
    } // ~Inst_VOP3__V_FLOOR_F32

    // --- description from .arch file ---
    // D.f = trunc(S0.f);
    // if(S0.f < 0.0 && S0.f != D.f) then D.f += -1.0.
    // Round the f32 source toward -infinity, per active lane.
    void
    Inst_VOP3__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::floor(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_EXP_F32 class methods ---

    Inst_VOP3__V_EXP_F32::Inst_VOP3__V_EXP_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_exp_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_EXP_F32

    Inst_VOP3__V_EXP_F32::~Inst_VOP3__V_EXP_F32()
    {
    } // ~Inst_VOP3__V_EXP_F32

    // --- description from .arch file ---
    // D.f = pow(2.0, S0.f).
- void - Inst_VOP3__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LOG_F32 class methods --- - - Inst_VOP3__V_LOG_F32::Inst_VOP3__V_LOG_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_log_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LOG_F32 - - Inst_VOP3__V_LOG_F32::~Inst_VOP3__V_LOG_F32() - { - } // ~Inst_VOP3__V_LOG_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm. - void - Inst_VOP3__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_RCP_F32 class methods --- - - Inst_VOP3__V_RCP_F32::Inst_VOP3__V_RCP_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_RCP_F32 - - Inst_VOP3__V_RCP_F32::~Inst_VOP3__V_RCP_F32() - { - } // ~Inst_VOP3__V_RCP_F32 - - // --- description from .arch file --- - // D.f = 1.0 / S0.f. 
    // Reciprocal with IEEE rules and < 1ulp error.
    // Per-lane reciprocal of the f32 source.
    void
    Inst_VOP3__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RCP_IFLAG_F32 class methods ---

    Inst_VOP3__V_RCP_IFLAG_F32::Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rcp_iflag_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RCP_IFLAG_F32

    Inst_VOP3__V_RCP_IFLAG_F32::~Inst_VOP3__V_RCP_IFLAG_F32()
    {
    } // ~Inst_VOP3__V_RCP_IFLAG_F32

    // --- description from .arch file ---
    // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise
    // --- integer DIV_BY_ZERO exception but cannot raise floating-point
    // --- exceptions.
    // Functionally identical to V_RCP_F32 here; exception state is not
    // modeled.
    void
    Inst_VOP3__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RSQ_F32 class methods ---

    Inst_VOP3__V_RSQ_F32::Inst_VOP3__V_RSQ_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rsq_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_RSQ_F32

    Inst_VOP3__V_RSQ_F32::~Inst_VOP3__V_RSQ_F32()
    {
    } // ~Inst_VOP3__V_RSQ_F32

    // --- description from .arch file ---
    // D.f = 1.0 / sqrt(S0.f).
    // Reciprocal square root with IEEE rules.
    // Per-lane reciprocal square root of the f32 source.
    void
    Inst_VOP3__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = 1.0 / std::sqrt(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RCP_F64 class methods ---

    Inst_VOP3__V_RCP_F64::Inst_VOP3__V_RCP_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rcp_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_RCP_F64

    Inst_VOP3__V_RCP_F64::~Inst_VOP3__V_RCP_F64()
    {
    } // ~Inst_VOP3__V_RCP_F64

    // --- description from .arch file ---
    // D.d = 1.0 / S0.d.
    // Per-lane reciprocal of the f64 source, with special cases handled
    // explicitly: 1/0 -> +inf, 1/NaN -> NaN, 1/(+-inf) -> +-0.
    void
    Inst_VOP3__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::fpclassify(src[lane]) == FP_ZERO) {
                    // Both +0 and -0 map to +inf here.
                    vdst[lane] = +INFINITY;
                } else if (std::isnan(src[lane])) {
                    vdst[lane] = NAN;
                } else if (std::isinf(src[lane])) {
                    // Sign of the zero result follows the sign of the
                    // infinite input.
                    if (std::signbit(src[lane])) {
                        vdst[lane] = -0.0;
                    } else {
                        vdst[lane] = 0.0;
                    }
                } else {
                    vdst[lane] = 1.0 / src[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_RSQ_F64 class methods ---

    Inst_VOP3__V_RSQ_F64::Inst_VOP3__V_RSQ_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_rsq_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_RSQ_F64

    Inst_VOP3__V_RSQ_F64::~Inst_VOP3__V_RSQ_F64()
    {
    } // ~Inst_VOP3__V_RSQ_F64

    // --- description from .arch file ---
    // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32.
    // Per-lane reciprocal square root of the f64 source, with special
    // cases handled explicitly: 0 -> +inf, NaN/negative -> NaN, +inf -> 0.
    void
    Inst_VOP3__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::fpclassify(src[lane]) == FP_ZERO) {
                    vdst[lane] = +INFINITY;
                } else if (std::isnan(src[lane])) {
                    vdst[lane] = NAN;
                } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) {
                    vdst[lane] = 0.0;
                } else if (std::signbit(src[lane])) {
                    // sqrt of a negative number is NaN.
                    vdst[lane] = NAN;
                } else {
                    vdst[lane] = 1.0 / std::sqrt(src[lane]);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SQRT_F32 class methods ---

    Inst_VOP3__V_SQRT_F32::Inst_VOP3__V_SQRT_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_sqrt_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_SQRT_F32

    Inst_VOP3__V_SQRT_F32::~Inst_VOP3__V_SQRT_F32()
    {
    } // ~Inst_VOP3__V_SQRT_F32

    // --- description from .arch file ---
    // D.f = sqrt(S0.f).
    // Per-lane square root of the f32 source.
    void
    Inst_VOP3__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::sqrt(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SQRT_F64 class methods ---

    Inst_VOP3__V_SQRT_F64::Inst_VOP3__V_SQRT_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_sqrt_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_SQRT_F64

    Inst_VOP3__V_SQRT_F64::~Inst_VOP3__V_SQRT_F64()
    {
    } // ~Inst_VOP3__V_SQRT_F64

    // --- description from .arch file ---
    // D.d = sqrt(S0.d).
    // Per-lane square root of the f64 source.
    void
    Inst_VOP3__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::sqrt(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_SIN_F32 class methods ---

    Inst_VOP3__V_SIN_F32::Inst_VOP3__V_SIN_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_sin_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_SIN_F32

    Inst_VOP3__V_SIN_F32::~Inst_VOP3__V_SIN_F32()
    {
    } // ~Inst_VOP3__V_SIN_F32

    // --- description from .arch file ---
    // D.f = sin(S0.f * 2 * PI).
    // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
    // float 0.0.
    // Per-lane sine of the f32 source, interpreted in revolutions
    // (input is scaled by 2*PI; PI is read from the constant register).
    // NOTE(review): the documented out-of-range clamp (|S0.f| > 256 -> 0.0)
    // is not implemented here -- confirm whether that is intentional.
    void
    Inst_VOP3__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();
        pi.read();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::sin(src[lane] * 2 * pi.rawData());
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_COS_F32 class methods ---

    Inst_VOP3__V_COS_F32::Inst_VOP3__V_COS_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cos_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_COS_F32

    Inst_VOP3__V_COS_F32::~Inst_VOP3__V_COS_F32()
    {
    } // ~Inst_VOP3__V_COS_F32

    // --- description from .arch file ---
    // D.f = cos(S0.f * 2 * PI).
    // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
    // float 1.0.
    // Per-lane cosine of the f32 source, interpreted in revolutions.
    // NOTE(review): the documented out-of-range clamp is not implemented
    // here either -- confirm intent.
    void
    Inst_VOP3__V_COS_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();
        pi.read();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::cos(src[lane] * 2 * pi.rawData());
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_NOT_B32 class methods ---

    Inst_VOP3__V_NOT_B32::Inst_VOP3__V_NOT_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_not_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_NOT_B32

    Inst_VOP3__V_NOT_B32::~Inst_VOP3__V_NOT_B32()
    {
    } // ~Inst_VOP3__V_NOT_B32

    // --- description from .arch file ---
    // D.u = ~S0.u.
    // Input and output modifiers not supported.
    // Per-lane bitwise complement of the u32 source.
    // NOTE(review): abs/neg modifiers are applied despite the "not
    // supported" note above -- matches the surrounding integer ops, but
    // confirm intent.
    void
    Inst_VOP3__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = ~src[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_BFREV_B32 class methods ---

    Inst_VOP3__V_BFREV_B32::Inst_VOP3__V_BFREV_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_bfrev_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BFREV_B32

    Inst_VOP3__V_BFREV_B32::~Inst_VOP3__V_BFREV_B32()
    {
    } // ~Inst_VOP3__V_BFREV_B32

    // --- description from .arch file ---
    // D.u[31:0] = S0.u[0:31], bitfield reverse.
    // Input and output modifiers not supported.
    // Per-lane bit reversal of the u32 source (no modifiers applied here).
    void
    Inst_VOP3__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = reverseBits(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_FFBH_U32 class methods ---

    Inst_VOP3__V_FFBH_U32::Inst_VOP3__V_FFBH_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ffbh_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_FFBH_U32

    Inst_VOP3__V_FFBH_U32::~Inst_VOP3__V_FFBH_U32()
    {
    } // ~Inst_VOP3__V_FFBH_U32

    // --- description from .arch file ---
    // D.u = position of first 1 in S0.u from MSB;
    // D.u = 0xffffffff if S0.u == 0.
    // Per-lane find-first-one from the MSB (0xffffffff when src is 0).
    void
    Inst_VOP3__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = findFirstOneMsb(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_FFBL_B32 class methods ---

    Inst_VOP3__V_FFBL_B32::Inst_VOP3__V_FFBL_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ffbl_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_FFBL_B32

    Inst_VOP3__V_FFBL_B32::~Inst_VOP3__V_FFBL_B32()
    {
    } // ~Inst_VOP3__V_FFBL_B32

    // --- description from .arch file ---
    // D.u = position of first 1 in S0.u from LSB;
    // D.u = 0xffffffff if S0.u == 0.
    // Per-lane find-first-one from the LSB (0xffffffff when src is 0).
    void
    Inst_VOP3__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = findFirstOne(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_FFBH_I32 class methods ---

    Inst_VOP3__V_FFBH_I32::Inst_VOP3__V_FFBH_I32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ffbh_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_FFBH_I32

    Inst_VOP3__V_FFBH_I32::~Inst_VOP3__V_FFBH_I32()
    {
    } // ~Inst_VOP3__V_FFBH_I32

    // --- description from .arch file ---
    // D.u = position of first bit different from sign bit in S0.i from MSB;
    // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff.
    // Per-lane position of the first bit that differs from the sign bit,
    // searching from the MSB (0xffffffff when src is 0 or all-ones).
    void
    Inst_VOP3__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = firstOppositeSignBit(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_FREXP_EXP_I32_F64 class methods ---

    Inst_VOP3__V_FREXP_EXP_I32_F64::Inst_VOP3__V_FREXP_EXP_I32_F64(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_FREXP_EXP_I32_F64

    Inst_VOP3__V_FREXP_EXP_I32_F64::~Inst_VOP3__V_FREXP_EXP_I32_F64()
    {
    } // ~Inst_VOP3__V_FREXP_EXP_I32_F64

    // --- description from .arch file ---
    // See V_FREXP_EXP_I32_F32.
    // Per-lane binary exponent of the f64 source; 0 for inf/NaN inputs.
    void
    Inst_VOP3__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else {
                    VecElemI32 exp(0);
                    std::frexp(src[lane], &exp);
                    vdst[lane] = exp;
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_FREXP_MANT_F64 class methods ---

    Inst_VOP3__V_FREXP_MANT_F64::Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_frexp_mant_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_FREXP_MANT_F64

    Inst_VOP3__V_FREXP_MANT_F64::~Inst_VOP3__V_FREXP_MANT_F64()
    {
    } // ~Inst_VOP3__V_FREXP_MANT_F64

    // ---
description from .arch file --- - // See V_FREXP_MANT_F32. - void - Inst_VOP3__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FRACT_F64 class methods --- - - Inst_VOP3__V_FRACT_F64::Inst_VOP3__V_FRACT_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fract_f64", false) - { - setFlag(ALU); - setFlag(F64); - } // Inst_VOP3__V_FRACT_F64 - - Inst_VOP3__V_FRACT_F64::~Inst_VOP3__V_FRACT_F64() - { - } // ~Inst_VOP3__V_FRACT_F64 - - // --- description from .arch file --- - // See V_FRACT_F32. - void - Inst_VOP3__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src(gpuDynInst, extData.SRC0); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 int_part(0.0); - vdst[lane] = std::modf(src[lane], &int_part); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FREXP_EXP_I32_F32 class methods --- - - Inst_VOP3__V_FREXP_EXP_I32_F32::Inst_VOP3__V_FREXP_EXP_I32_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_FREXP_EXP_I32_F32 - - Inst_VOP3__V_FREXP_EXP_I32_F32::~Inst_VOP3__V_FREXP_EXP_I32_F32() - { - } // ~Inst_VOP3__V_FREXP_EXP_I32_F32 - - // --- description from .arch file --- - // if(S0.f == INF || S0.f == NAN) 
    // then D.i = 0;
    // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1).
    // Returns exponent of single precision float input, such that S0.f =
    // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns
    // the significand.
    // Per-lane binary exponent of the f32 source; 0 for inf/NaN inputs.
    void
    Inst_VOP3__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        if (instData.ABS & 0x1) {
            src.absModifier();
        }

        if (extData.NEG & 0x1) {
            src.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isinf(src[lane])|| std::isnan(src[lane])) {
                    vdst[lane] = 0;
                } else {
                    VecElemI32 exp(0);
                    std::frexp(src[lane], &exp);
                    vdst[lane] = exp;
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_FREXP_MANT_F32 class methods ---

    Inst_VOP3__V_FREXP_MANT_F32::Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_frexp_mant_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_FREXP_MANT_F32

    Inst_VOP3__V_FREXP_MANT_F32::~Inst_VOP3__V_FREXP_MANT_F32()
    {
    } // ~Inst_VOP3__V_FREXP_MANT_F32

    // --- description from .arch file ---
    // if(S0.f == INF || S0.f == NAN) then D.f = S0.f;
    // else D.f = Mantissa(S0.f).
    // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary
    // --- significand of single precision float input, such that S0.f =
    // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which
    // --- returns integer exponent.
- void - Inst_VOP3__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isinf(src[lane]) || std::isnan(src[lane])) { - vdst[lane] = src[lane]; - } else { - VecElemI32 exp(0); - vdst[lane] = std::frexp(src[lane], &exp); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CLREXCP class methods --- - - Inst_VOP3__V_CLREXCP::Inst_VOP3__V_CLREXCP(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_clrexcp", false) - { - } // Inst_VOP3__V_CLREXCP - - Inst_VOP3__V_CLREXCP::~Inst_VOP3__V_CLREXCP() - { - } // ~Inst_VOP3__V_CLREXCP - - // --- description from .arch file --- - // Clear wave's exception state in SIMD (SP). - void - Inst_VOP3__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F16_U16 class methods --- - - Inst_VOP3__V_CVT_F16_U16::Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f16_u16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_F16_U16 - - Inst_VOP3__V_CVT_F16_U16::~Inst_VOP3__V_CVT_F16_U16() - { - } // ~Inst_VOP3__V_CVT_F16_U16 - - // --- description from .arch file --- - // D.f16 = uint16_to_flt16(S.u16). - // Supports denormals, rounding, exception flags and saturation. 
- void - Inst_VOP3__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_F16_I16 class methods --- - - Inst_VOP3__V_CVT_F16_I16::Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_f16_i16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_F16_I16 - - Inst_VOP3__V_CVT_F16_I16::~Inst_VOP3__V_CVT_F16_I16() - { - } // ~Inst_VOP3__V_CVT_F16_I16 - - // --- description from .arch file --- - // D.f16 = int16_to_flt16(S.i16). - // Supports denormals, rounding, exception flags and saturation. - void - Inst_VOP3__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_U16_F16 class methods --- - - Inst_VOP3__V_CVT_U16_F16::Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_u16_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_U16_F16 - - Inst_VOP3__V_CVT_U16_F16::~Inst_VOP3__V_CVT_U16_F16() - { - } // ~Inst_VOP3__V_CVT_U16_F16 - - // --- description from .arch file --- - // D.u16 = flt16_to_uint16(S.f16). - // Supports rounding, exception flags and saturation. - void - Inst_VOP3__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CVT_I16_F16 class methods --- - - Inst_VOP3__V_CVT_I16_F16::Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_i16_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CVT_I16_F16 - - Inst_VOP3__V_CVT_I16_F16::~Inst_VOP3__V_CVT_I16_F16() - { - } // ~Inst_VOP3__V_CVT_I16_F16 - - // --- description from .arch file --- - // D.i16 = flt16_to_int16(S.f16). - // Supports rounding, exception flags and saturation. 
- void - Inst_VOP3__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_RCP_F16 class methods --- - - Inst_VOP3__V_RCP_F16::Inst_VOP3__V_RCP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rcp_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_RCP_F16 - - Inst_VOP3__V_RCP_F16::~Inst_VOP3__V_RCP_F16() - { - } // ~Inst_VOP3__V_RCP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecip(S0.f16). - void - Inst_VOP3__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SQRT_F16 class methods --- - - Inst_VOP3__V_SQRT_F16::Inst_VOP3__V_SQRT_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sqrt_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SQRT_F16 - - Inst_VOP3__V_SQRT_F16::~Inst_VOP3__V_SQRT_F16() - { - } // ~Inst_VOP3__V_SQRT_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateSqrt(S0.f16). - void - Inst_VOP3__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_RSQ_F16 class methods --- - - Inst_VOP3__V_RSQ_F16::Inst_VOP3__V_RSQ_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rsq_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_RSQ_F16 - - Inst_VOP3__V_RSQ_F16::~Inst_VOP3__V_RSQ_F16() - { - } // ~Inst_VOP3__V_RSQ_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 1.0f; - // else - // D.f16 = ApproximateRecipSqrt(S0.f16). 
- void - Inst_VOP3__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_LOG_F16 class methods --- - - Inst_VOP3__V_LOG_F16::Inst_VOP3__V_LOG_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_log_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_LOG_F16 - - Inst_VOP3__V_LOG_F16::~Inst_VOP3__V_LOG_F16() - { - } // ~Inst_VOP3__V_LOG_F16 - - // --- description from .arch file --- - // if(S0.f16 == 1.0f) - // D.f16 = 0.0f; - // else - // D.f16 = ApproximateLog2(S0.f16). - void - Inst_VOP3__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_EXP_F16 class methods --- - - Inst_VOP3__V_EXP_F16::Inst_VOP3__V_EXP_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_exp_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_EXP_F16 - - Inst_VOP3__V_EXP_F16::~Inst_VOP3__V_EXP_F16() - { - } // ~Inst_VOP3__V_EXP_F16 - - // --- description from .arch file --- - // if(S0.f16 == 0.0f) - // D.f16 = 1.0f; - // else - // D.f16 = Approximate2ToX(S0.f16). - void - Inst_VOP3__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FREXP_MANT_F16 class methods --- - - Inst_VOP3__V_FREXP_MANT_F16::Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_mant_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FREXP_MANT_F16 - - Inst_VOP3__V_FREXP_MANT_F16::~Inst_VOP3__V_FREXP_MANT_F16() - { - } // ~Inst_VOP3__V_FREXP_MANT_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.f16 = S0.f16; - // else - // D.f16 = mantissa(S0.f16). - // Result range is (-1.0,-0.5][0.5,1.0). - // C math library frexp function. - // Returns binary significand of half precision float input, such that the - // original single float = significand * (2 ** exponent). 
- void - Inst_VOP3__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FREXP_EXP_I16_F16 class methods --- - - Inst_VOP3__V_FREXP_EXP_I16_F16::Inst_VOP3__V_FREXP_EXP_I16_F16( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_frexp_exp_i16_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FREXP_EXP_I16_F16 - - Inst_VOP3__V_FREXP_EXP_I16_F16::~Inst_VOP3__V_FREXP_EXP_I16_F16() - { - } // ~Inst_VOP3__V_FREXP_EXP_I16_F16 - - // --- description from .arch file --- - // if(S0.f16 == +-INF || S0.f16 == NAN) - // D.i16 = 0; - // else - // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1). - // C math library frexp function. - // Returns exponent of half precision float input, such that the - // original single float = significand * (2 ** exponent). - void - Inst_VOP3__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FLOOR_F16 class methods --- - - Inst_VOP3__V_FLOOR_F16::Inst_VOP3__V_FLOOR_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_floor_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FLOOR_F16 - - Inst_VOP3__V_FLOOR_F16::~Inst_VOP3__V_FLOOR_F16() - { - } // ~Inst_VOP3__V_FLOOR_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f. - void - Inst_VOP3__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CEIL_F16 class methods --- - - Inst_VOP3__V_CEIL_F16::Inst_VOP3__V_CEIL_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_ceil_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_CEIL_F16 - - Inst_VOP3__V_CEIL_F16::~Inst_VOP3__V_CEIL_F16() - { - } // ~Inst_VOP3__V_CEIL_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16); - // if(S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f. 
- void - Inst_VOP3__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_TRUNC_F16 class methods --- - - Inst_VOP3__V_TRUNC_F16::Inst_VOP3__V_TRUNC_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_trunc_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_TRUNC_F16 - - Inst_VOP3__V_TRUNC_F16::~Inst_VOP3__V_TRUNC_F16() - { - } // ~Inst_VOP3__V_TRUNC_F16 - - // --- description from .arch file --- - // D.f16 = trunc(S0.f16). - // Round-to-zero semantics. - void - Inst_VOP3__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_RNDNE_F16 class methods --- - - Inst_VOP3__V_RNDNE_F16::Inst_VOP3__V_RNDNE_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_rndne_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_RNDNE_F16 - - Inst_VOP3__V_RNDNE_F16::~Inst_VOP3__V_RNDNE_F16() - { - } // ~Inst_VOP3__V_RNDNE_F16 - - // --- description from .arch file --- - // D.f16 = FLOOR(S0.f16 + 0.5f); - // if(floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f. - // Round-to-nearest-even semantics. - void - Inst_VOP3__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_FRACT_F16 class methods --- - - Inst_VOP3__V_FRACT_F16::Inst_VOP3__V_FRACT_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fract_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_FRACT_F16 - - Inst_VOP3__V_FRACT_F16::~Inst_VOP3__V_FRACT_F16() - { - } // ~Inst_VOP3__V_FRACT_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 + -floor(S0.f16). 
- void - Inst_VOP3__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_SIN_F16 class methods --- - - Inst_VOP3__V_SIN_F16::Inst_VOP3__V_SIN_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sin_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_SIN_F16 - - Inst_VOP3__V_SIN_F16::~Inst_VOP3__V_SIN_F16() - { - } // ~Inst_VOP3__V_SIN_F16 - - // --- description from .arch file --- - // D.f16 = sin(S0.f16 * 2 * PI). - void - Inst_VOP3__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_COS_F16 class methods --- - - Inst_VOP3__V_COS_F16::Inst_VOP3__V_COS_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cos_f16", false) - { - setFlag(ALU); - setFlag(F16); - } // Inst_VOP3__V_COS_F16 - - Inst_VOP3__V_COS_F16::~Inst_VOP3__V_COS_F16() - { - } // ~Inst_VOP3__V_COS_F16 - - // --- description from .arch file --- - // D.f16 = cos(S0.f16 * 2 * PI). - void - Inst_VOP3__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_EXP_LEGACY_F32 class methods --- - - Inst_VOP3__V_EXP_LEGACY_F32::Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_exp_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_EXP_LEGACY_F32 - - Inst_VOP3__V_EXP_LEGACY_F32::~Inst_VOP3__V_EXP_LEGACY_F32() - { - } // ~Inst_VOP3__V_EXP_LEGACY_F32 - - // --- description from .arch file --- - // D.f = pow(2.0, S0.f) with legacy semantics. 
- void - Inst_VOP3__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - if (instData.ABS & 0x1) { - src.absModifier(); - } - - if (extData.NEG & 0x1) { - src.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::pow(2.0, src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LOG_LEGACY_F32 class methods --- - - Inst_VOP3__V_LOG_LEGACY_F32::Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_log_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_LOG_LEGACY_F32 - - Inst_VOP3__V_LOG_LEGACY_F32::~Inst_VOP3__V_LOG_LEGACY_F32() - { - } // ~Inst_VOP3__V_LOG_LEGACY_F32 - - // --- description from .arch file --- - // D.f = log2(S0.f). Base 2 logarithm with legacy semantics. 
- void - Inst_VOP3__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src(gpuDynInst, extData.SRC0); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src.readSrc(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::log2(src[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_LEGACY_F32 class methods --- - - Inst_VOP3__V_MAD_LEGACY_F32::Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_legacy_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } // Inst_VOP3__V_MAD_LEGACY_F32 - - Inst_VOP3__V_MAD_LEGACY_F32::~Inst_VOP3__V_MAD_LEGACY_F32() - { - } // ~Inst_VOP3__V_MAD_LEGACY_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + S2.f (DX9 rules, 0.0 * x = 0.0). - void - Inst_VOP3__V_MAD_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_F32 class methods --- - - Inst_VOP3__V_MAD_F32::Inst_VOP3__V_MAD_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(MAD); - } 
// Inst_VOP3__V_MAD_F32 - - Inst_VOP3__V_MAD_F32::~Inst_VOP3__V_MAD_F32() - { - } // ~Inst_VOP3__V_MAD_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + S2.f. - void - Inst_VOP3__V_MAD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_I32_I24 class methods --- - - Inst_VOP3__V_MAD_I32_I24::Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_i32_i24", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_I32_I24 - - Inst_VOP3__V_MAD_I32_I24::~Inst_VOP3__V_MAD_I32_I24() - { - } // ~Inst_VOP3__V_MAD_I32_I24 - - // --- description from .arch file --- - // D.i = S0.i[23:0] * S1.i[23:0] + S2.i. 
- void - Inst_VOP3__V_MAD_I32_I24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = sext<24>(bits(src0[lane], 23, 0)) - * sext<24>(bits(src1[lane], 23, 0)) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_U32_U24 class methods --- - - Inst_VOP3__V_MAD_U32_U24::Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_u32_u24", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_U32_U24 - - Inst_VOP3__V_MAD_U32_U24::~Inst_VOP3__V_MAD_U32_U24() - { - } // ~Inst_VOP3__V_MAD_U32_U24 - - // --- description from .arch file --- - // D.u = S0.u[23:0] * S1.u[23:0] + S2.u. 
- void - Inst_VOP3__V_MAD_U32_U24::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0) - + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CUBEID_F32 class methods --- - - Inst_VOP3__V_CUBEID_F32::Inst_VOP3__V_CUBEID_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubeid_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBEID_F32 - - Inst_VOP3__V_CUBEID_F32::~Inst_VOP3__V_CUBEID_F32() - { - } // ~Inst_VOP3__V_CUBEID_F32 - - // --- description from .arch file --- - // D.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). XYZ coordinate is given in - // --- (S0.f, S1.f, S2.f). - void - Inst_VOP3__V_CUBEID_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CUBESC_F32 class methods --- - - Inst_VOP3__V_CUBESC_F32::Inst_VOP3__V_CUBESC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubesc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBESC_F32 - - Inst_VOP3__V_CUBESC_F32::~Inst_VOP3__V_CUBESC_F32() - { - } // ~Inst_VOP3__V_CUBESC_F32 - - // --- description from .arch file --- - // D.f = cubemap S coordinate. XYZ coordinate is given in (S0.f, S1.f, - // S2.f). 
- void - Inst_VOP3__V_CUBESC_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CUBETC_F32 class methods --- - - Inst_VOP3__V_CUBETC_F32::Inst_VOP3__V_CUBETC_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubetc_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBETC_F32 - - Inst_VOP3__V_CUBETC_F32::~Inst_VOP3__V_CUBETC_F32() - { - } // ~Inst_VOP3__V_CUBETC_F32 - - // --- description from .arch file --- - // D.f = cubemap T coordinate. XYZ coordinate is given in (S0.f, S1.f, - // S2.f). - void - Inst_VOP3__V_CUBETC_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_CUBEMA_F32 class methods --- - - Inst_VOP3__V_CUBEMA_F32::Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cubema_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CUBEMA_F32 - - Inst_VOP3__V_CUBEMA_F32::~Inst_VOP3__V_CUBEMA_F32() - { - } // ~Inst_VOP3__V_CUBEMA_F32 - - // --- description from .arch file --- - // D.f = 2.0 * cubemap major axis. XYZ coordinate is given in (S0.f, S1.f, - // --- S2.f). 
- void - Inst_VOP3__V_CUBEMA_F32::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_BFE_U32 class methods --- - - Inst_VOP3__V_BFE_U32::Inst_VOP3__V_BFE_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfe_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFE_U32 - - Inst_VOP3__V_BFE_U32::~Inst_VOP3__V_BFE_U32() - { - } // ~Inst_VOP3__V_BFE_U32 - - // --- description from .arch file --- - // D.u = (S0.u>>S1.u[4:0]) & ((1<wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0)) - & ((1 << bits(src2[lane], 4, 0)) - 1); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_BFE_I32 class methods --- - - Inst_VOP3__V_BFE_I32::Inst_VOP3__V_BFE_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfe_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFE_I32 - - Inst_VOP3__V_BFE_I32::~Inst_VOP3__V_BFE_I32() - { - } // ~Inst_VOP3__V_BFE_I32 - - // --- description from .arch file --- - // D.i = (S0.i>>S1.u[4:0]) & ((1<wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - 
assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0)) - & ((1 << bits(src2[lane], 4, 0)) - 1); - - // Above extracted a signed int of size src2 bits which needs - // to be signed-extended. Check if the MSB of our src2-bit - // integer is 1, and sign extend it is. - if (vdst[lane] >> (bits(src2[lane], 4, 0) - 1)) { - vdst[lane] |= 0xffffffff << bits(src2[lane], 4, 0); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_BFI_B32 class methods --- - - Inst_VOP3__V_BFI_B32::Inst_VOP3__V_BFI_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfi_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFI_B32 - - Inst_VOP3__V_BFI_B32::~Inst_VOP3__V_BFI_B32() - { - } // ~Inst_VOP3__V_BFI_B32 - - // --- description from .arch file --- - // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert. 
- void - Inst_VOP3__V_BFI_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane] - & src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FMA_F32 class methods --- - - Inst_VOP3__V_FMA_F32::Inst_VOP3__V_FMA_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fma_f32", false) - { - setFlag(ALU); - setFlag(F32); - setFlag(FMA); - } // Inst_VOP3__V_FMA_F32 - - Inst_VOP3__V_FMA_F32::~Inst_VOP3__V_FMA_F32() - { - } // ~Inst_VOP3__V_FMA_F32 - - // --- description from .arch file --- - // D.f = S0.f * S1.f + S2.f. 
- void - Inst_VOP3__V_FMA_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_FMA_F64 class methods --- - - Inst_VOP3__V_FMA_F64::Inst_VOP3__V_FMA_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_fma_f64", false) - { - setFlag(ALU); - setFlag(F64); - setFlag(FMA); - } // Inst_VOP3__V_FMA_F64 - - Inst_VOP3__V_FMA_F64::~Inst_VOP3__V_FMA_F64() - { - } // ~Inst_VOP3__V_FMA_F64 - - // --- description from .arch file --- - // D.d = S0.d * S1.d + S2.d. 
- void - Inst_VOP3__V_FMA_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LERP_U8 class methods --- - - Inst_VOP3__V_LERP_U8::Inst_VOP3__V_LERP_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lerp_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LERP_U8 - - Inst_VOP3__V_LERP_U8::~Inst_VOP3__V_LERP_U8() - { - } // ~Inst_VOP3__V_LERP_U8 - - // --- description from .arch file --- - // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24 - // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16; - // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8; - // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1). - // Unsigned 8-bit pixel average on packed unsigned bytes (linear - // --- interpolation). S2 acts as a round mode; if set, 0.5 rounds up, - // --- otherwise 0.5 truncates. 
- void - Inst_VOP3__V_LERP_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ((bits(src0[lane], 31, 24) - + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1) - << 24; - vdst[lane] += ((bits(src0[lane], 23, 16) - + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1) - << 16; - vdst[lane] += ((bits(src0[lane], 15, 8) - + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1) - << 8; - vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0) - + bits(src2[lane], 0)) >> 1); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ALIGNBIT_B32 class methods --- - - Inst_VOP3__V_ALIGNBIT_B32::Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_alignbit_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ALIGNBIT_B32 - - Inst_VOP3__V_ALIGNBIT_B32::~Inst_VOP3__V_ALIGNBIT_B32() - { - } // ~Inst_VOP3__V_ALIGNBIT_B32 - - // --- description from .arch file --- - // D.u = ({S0,S1} >> S2.u[4:0]) & 0xffffffff. 
- void - Inst_VOP3__V_ALIGNBIT_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) - | (VecElemU64)src1[lane]); - vdst[lane] = (VecElemU32)((src_0_1 - >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ALIGNBYTE_B32 class methods --- - - Inst_VOP3__V_ALIGNBYTE_B32::Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_alignbyte_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ALIGNBYTE_B32 - - Inst_VOP3__V_ALIGNBYTE_B32::~Inst_VOP3__V_ALIGNBYTE_B32() - { - } // ~Inst_VOP3__V_ALIGNBYTE_B32 - - // --- description from .arch file --- - // D.u = ({S0,S1} >> (8*S2.u[4:0])) & 0xffffffff. 
- void - Inst_VOP3__V_ALIGNBYTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32) - | (VecElemU64)src1[lane]); - vdst[lane] = (VecElemU32)((src_0_1 - >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0))) - & 0xffffffff); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN3_F32 class methods --- - - Inst_VOP3__V_MIN3_F32::Inst_VOP3__V_MIN3_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MIN3_F32 - - Inst_VOP3__V_MIN3_F32::~Inst_VOP3__V_MIN3_F32() - { - } // ~Inst_VOP3__V_MIN3_F32 - - // --- description from .arch file --- - // D.f = min(S0.f, S1.f, S2.f). 
- void - Inst_VOP3__V_MIN3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]); - vdst[lane] = std::fmin(min_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN3_I32 class methods --- - - Inst_VOP3__V_MIN3_I32::Inst_VOP3__V_MIN3_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN3_I32 - - Inst_VOP3__V_MIN3_I32::~Inst_VOP3__V_MIN3_I32() - { - } // ~Inst_VOP3__V_MIN3_I32 - - // --- description from .arch file --- - // D.i = min(S0.i, S1.i, S2.i). 
- void - Inst_VOP3__V_MIN3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]); - vdst[lane] = std::min(min_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MIN3_U32 class methods --- - - Inst_VOP3__V_MIN3_U32::Inst_VOP3__V_MIN3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_min3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MIN3_U32 - - Inst_VOP3__V_MIN3_U32::~Inst_VOP3__V_MIN3_U32() - { - } // ~Inst_VOP3__V_MIN3_U32 - - // --- description from .arch file --- - // D.u = min(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MIN3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]); - vdst[lane] = std::min(min_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX3_F32 class methods --- - - Inst_VOP3__V_MAX3_F32::Inst_VOP3__V_MAX3_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MAX3_F32 - - Inst_VOP3__V_MAX3_F32::~Inst_VOP3__V_MAX3_F32() - { - } // ~Inst_VOP3__V_MAX3_F32 - - // --- description from .arch file --- - // D.f = max(S0.f, S1.f, S2.f). 
- void - Inst_VOP3__V_MAX3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]); - vdst[lane] = std::fmax(max_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX3_I32 class methods --- - - Inst_VOP3__V_MAX3_I32::Inst_VOP3__V_MAX3_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX3_I32 - - Inst_VOP3__V_MAX3_I32::~Inst_VOP3__V_MAX3_I32() - { - } // ~Inst_VOP3__V_MAX3_I32 - - // --- description from .arch file --- - // D.i = max(S0.i, S1.i, S2.i). 
- void - Inst_VOP3__V_MAX3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]); - vdst[lane] = std::max(max_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAX3_U32 class methods --- - - Inst_VOP3__V_MAX3_U32::Inst_VOP3__V_MAX3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_max3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MAX3_U32 - - Inst_VOP3__V_MAX3_U32::~Inst_VOP3__V_MAX3_U32() - { - } // ~Inst_VOP3__V_MAX3_U32 - - // --- description from .arch file --- - // D.u = max(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MAX3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]); - vdst[lane] = std::max(max_0_1, src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MED3_F32 class methods --- - - Inst_VOP3__V_MED3_F32::Inst_VOP3__V_MED3_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_med3_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_MED3_F32 - - Inst_VOP3__V_MED3_F32::~Inst_VOP3__V_MED3_F32() - { - } // ~Inst_VOP3__V_MED3_F32 - - // --- description from .arch file --- - // D.f = median(S0.f, S1.f, S2.f). 
- void - Inst_VOP3__V_MED3_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MED3_I32 class methods --- - - Inst_VOP3__V_MED3_I32::Inst_VOP3__V_MED3_I32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_med3_i32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MED3_I32 - - Inst_VOP3__V_MED3_I32::~Inst_VOP3__V_MED3_I32() - { - } // ~Inst_VOP3__V_MED3_I32 - - // --- description from .arch file --- - // D.i = median(S0.i, S1.i, S2.i). 
- void - Inst_VOP3__V_MED3_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2(gpuDynInst, extData.SRC2); - VecOperandI32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MED3_U32 class methods --- - - Inst_VOP3__V_MED3_U32::Inst_VOP3__V_MED3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_med3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MED3_U32 - - Inst_VOP3__V_MED3_U32::~Inst_VOP3__V_MED3_U32() - { - } // ~Inst_VOP3__V_MED3_U32 - - // --- description from .arch file --- - // D.u = median(S0.u, S1.u, S2.u). 
- void - Inst_VOP3__V_MED3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = median(src0[lane], src1[lane], src2[lane]); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_U8 class methods --- - - Inst_VOP3__V_SAD_U8::Inst_VOP3__V_SAD_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U8 - - Inst_VOP3__V_SAD_U8::~Inst_VOP3__V_SAD_U8() - { - } // ~Inst_VOP3__V_SAD_U8 - - // --- description from .arch file --- - // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) + - // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u. - // Sum of absolute differences with accumulation, overflow into upper bits - // is allowed. 
- void - Inst_VOP3__V_SAD_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(bits(src0[lane], 31, 24) - - bits(src1[lane], 31, 24)) - + std::abs(bits(src0[lane], 23, 16) - - bits(src1[lane], 23, 16)) - + std::abs(bits(src0[lane], 15, 8) - - bits(src1[lane], 15, 8)) - + std::abs(bits(src0[lane], 7, 0) - - bits(src1[lane], 7, 0)) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_HI_U8 class methods --- - - Inst_VOP3__V_SAD_HI_U8::Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_hi_u8", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_HI_U8 - - Inst_VOP3__V_SAD_HI_U8::~Inst_VOP3__V_SAD_HI_U8() - { - } // ~Inst_VOP3__V_SAD_HI_U8 - - // --- description from .arch file --- - // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u. - // Sum of absolute differences with accumulation, overflow is lost. 
- void - Inst_VOP3__V_SAD_HI_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (((bits(src0[lane], 31, 24) - - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16) - - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8) - - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0) - - bits(src1[lane], 7, 0))) << 16) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_U16 class methods --- - - Inst_VOP3__V_SAD_U16::Inst_VOP3__V_SAD_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_u16", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U16 - - Inst_VOP3__V_SAD_U16::~Inst_VOP3__V_SAD_U16() - { - } // ~Inst_VOP3__V_SAD_U16 - - // --- description from .arch file --- - // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0]) - // + S2.u. - // Word SAD with accumulation. 
- void - Inst_VOP3__V_SAD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(bits(src0[lane], 31, 16) - - bits(src1[lane], 31, 16)) - + std::abs(bits(src0[lane], 15, 0) - - bits(src1[lane], 15, 0)) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_SAD_U32 class methods --- - - Inst_VOP3__V_SAD_U32::Inst_VOP3__V_SAD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_sad_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_SAD_U32 - - Inst_VOP3__V_SAD_U32::~Inst_VOP3__V_SAD_U32() - { - } // ~Inst_VOP3__V_SAD_U32 - - // --- description from .arch file --- - // D.u = abs(S0.i - S1.i) + S2.u. - // Dword SAD with accumulation. 
- void - Inst_VOP3__V_SAD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane]; - } // if - } // for - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_PK_U8_F32 class methods --- - - Inst_VOP3__V_CVT_PK_U8_F32::Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pk_u8_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PK_U8_F32 - - Inst_VOP3__V_CVT_PK_U8_F32::~Inst_VOP3__V_CVT_PK_U8_F32() - { - } // ~Inst_VOP3__V_CVT_PK_U8_F32 - - // --- description from .arch file --- - // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0])) - // | (S2.u & ~(0xff << (8 * S1.u[1:0]))). - // Convert floating point value S0 to 8-bit unsigned integer and pack the - // result into byte S1 of dword S2. 
    // v_cvt_pk_u8_f32: convert fp32 S0 to an 8-bit unsigned integer and
    // insert it into byte S1[1:0] of S2; all other bytes of S2 pass
    // through unchanged.
    void
    Inst_VOP3__V_CVT_PK_U8_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // Only src0 is floating point, so only it can take the ABS/NEG
        // input modifiers (bit 0 of each mask).
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }


        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Place the converted byte at byte position S1[1:0] and
                // merge the remaining bytes of S2 via the inverted mask.
                // NOTE(review): the (VecElemU8) cast truncates toward
                // zero and is undefined for negative or out-of-range
                // floats -- flt32_to_uint8 in the spec presumably
                // clamps; confirm intended conversion semantics.
                vdst[lane] = (((VecElemU8)src0[lane] & 0xff)
                    << (8 * bits(src1[lane], 1, 0)))
                    | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0))));
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_DIV_FIXUP_F32 class methods ---

    Inst_VOP3__V_DIV_FIXUP_F32::Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_div_fixup_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_DIV_FIXUP_F32

    Inst_VOP3__V_DIV_FIXUP_F32::~Inst_VOP3__V_DIV_FIXUP_F32()
    {
    } // ~Inst_VOP3__V_DIV_FIXUP_F32

    // --- description from .arch file ---
    // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator,
    // s2.f = Numerator. This opcode generates exceptions resulting from the
    // division operation.
    // v_div_fixup_f32: post-division fixup. S0 = quotient estimate,
    // S1 = denominator, S2 = numerator. This implementation handles the
    // special cases explicitly and otherwise recomputes S2/S1 directly
    // (the quotient estimate S0 is not used on the common path).
    void
    Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        // FP input modifiers, one select bit per source.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Zero denominator: signed infinity.
                // NOTE(review): the sign is taken from S1 alone rather
                // than S1 xor S2 as the F64 variant does -- confirm.
                if (std::fpclassify(src1[lane]) == FP_ZERO) {
                    if (std::signbit(src1[lane])) {
                        vdst[lane] = -INFINITY;
                    } else {
                        vdst[lane] = +INFINITY;
                    }
                } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) {
                    // NaN numerator or denominator propagates NaN.
                    vdst[lane] = NAN;
                } else if (std::isinf(src1[lane])) {
                    // NOTE(review): an infinite denominator yields
                    // infinity here, but finite/inf should be a signed
                    // zero (the F64 variant returns +/-0.0) -- confirm
                    // against the ISA specification.
                    if (std::signbit(src1[lane])) {
                        vdst[lane] = -INFINITY;
                    } else {
                        vdst[lane] = +INFINITY;
                    }
                } else {
                    // Common path: recompute the quotient.
                    vdst[lane] = src2[lane] / src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---

    Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_div_fixup_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_DIV_FIXUP_F64

    Inst_VOP3__V_DIV_FIXUP_F64::~Inst_VOP3__V_DIV_FIXUP_F64()
    {
    } // ~Inst_VOP3__V_DIV_FIXUP_F64

    // --- description from .arch file ---
    // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
    // s2.d = Numerator. This opcode generates exceptions resulting from the
    // division operation.
- void - Inst_VOP3__V_DIV_FIXUP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int sign_out = std::signbit(src1[lane]) - ^ std::signbit(src2[lane]); - int exp1(0); - int exp2(0); - std::frexp(src1[lane], &exp1); - std::frexp(src2[lane], &exp2); - - if (std::isnan(src1[lane]) || std::isnan(src2[lane])) { - vdst[lane] = std::numeric_limits::quiet_NaN(); - } else if (std::fpclassify(src1[lane]) == FP_ZERO - && std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] - = std::numeric_limits::signaling_NaN(); - } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { - vdst[lane] - = std::numeric_limits::signaling_NaN(); - } else if (std::fpclassify(src1[lane]) == FP_ZERO - || std::isinf(src2[lane])) { - vdst[lane] = sign_out ? -INFINITY : +INFINITY; - } else if (std::isinf(src1[lane]) - || std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] = sign_out ? -0.0 : +0.0; - } else if (exp2 - exp1 < -1075) { - vdst[lane] = src0[lane]; - } else if (exp1 == 2047) { - vdst[lane] = src0[lane]; - } else { - vdst[lane] = sign_out ? 
-std::fabs(src0[lane]) - : std::fabs(src0[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_SCALE_F32 class methods --- - - Inst_VOP3__V_DIV_SCALE_F32::Inst_VOP3__V_DIV_SCALE_F32( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_div_scale_f32") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(F32); - } // Inst_VOP3__V_DIV_SCALE_F32 - - Inst_VOP3__V_DIV_SCALE_F32::~Inst_VOP3__V_DIV_SCALE_F32() - { - } // ~Inst_VOP3__V_DIV_SCALE_F32 - - // --- description from .arch file --- - // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f = - // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a - // numerator and denominator, this opcode will appropriately scale inputs - // for division to avoid subnormal terms during Newton-Raphson correction - // algorithm. This opcode producses a VCC flag for post-scale of quotient. - void - Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandF32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane]; - vcc.setBit(lane, 0); - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_SCALE_F64 class methods --- - - Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64( - InFmt_VOP3B *iFmt) - : Inst_VOP3B(iFmt, "v_div_scale_f64") - { - setFlag(ALU); - setFlag(WritesVCC); - setFlag(F64); - } // Inst_VOP3__V_DIV_SCALE_F64 - - Inst_VOP3__V_DIV_SCALE_F64::~Inst_VOP3__V_DIV_SCALE_F64() - { - 
} // ~Inst_VOP3__V_DIV_SCALE_F64 - - // --- description from .arch file --- - // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d = - // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a - // numerator and denominator, this opcode will appropriately scale inputs - // for division to avoid subnormal terms during Newton-Raphson correction - // algorithm. This opcode producses a VCC flag for post-scale of quotient. - void - Inst_VOP3__V_DIV_SCALE_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - int exp1(0); - int exp2(0); - std::frexp(src1[lane], &exp1); - std::frexp(src2[lane], &exp2); - vcc.setBit(lane, 0); - - if (std::fpclassify(src1[lane]) == FP_ZERO - || std::fpclassify(src2[lane]) == FP_ZERO) { - vdst[lane] = NAN; - } else if (exp2 - exp1 >= 768) { - vcc.setBit(lane, 1); - if (src0[lane] == src1[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) { - vdst[lane] = std::ldexp(src0[lane], 128); - } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL - && std::fpclassify(src2[lane] / src1[lane]) - == FP_SUBNORMAL) { - vcc.setBit(lane, 1); - if (src0[lane] == src1[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) { - vdst[lane] = std::ldexp(src0[lane], -128); - } else if (std::fpclassify(src2[lane] / 
src1[lane]) - == FP_SUBNORMAL) { - vcc.setBit(lane, 1); - if (src0[lane] == src2[lane]) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } else if (exp2 <= 53) { - vdst[lane] = std::ldexp(src0[lane], 128); - } - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FMAS_F32 class methods --- - - Inst_VOP3__V_DIV_FMAS_F32::Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fmas_f32", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - setFlag(F32); - setFlag(FMA); - } // Inst_VOP3__V_DIV_FMAS_F32 - - Inst_VOP3__V_DIV_FMAS_F32::~Inst_VOP3__V_DIV_FMAS_F32() - { - } // ~Inst_VOP3__V_DIV_FMAS_F32 - - // --- description from .arch file --- - // D.f = Special case divide FMA with scale and flags(s0.f = Quotient, - // s1.f = Denominator, s2.f = Numerator) - void - Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (instData.ABS & 0x2) { - src1.absModifier(); - } - - if (instData.ABS & 0x4) { - src2.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - if (extData.NEG & 0x2) { - src1.negModifier(); - } - - if (extData.NEG & 0x4) { - src2.negModifier(); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); - } - } - - //vdst.write(); - } // execute - // --- Inst_VOP3__V_DIV_FMAS_F64 class methods --- - - Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_div_fmas_f64", false) - { - setFlag(ALU); - setFlag(ReadsVCC); - setFlag(F64); - setFlag(FMA); - } // Inst_VOP3__V_DIV_FMAS_F64 - - 
    Inst_VOP3__V_DIV_FMAS_F64::~Inst_VOP3__V_DIV_FMAS_F64()
    {
    } // ~Inst_VOP3__V_DIV_FMAS_F64

    // --- description from .arch file ---
    // D.d = Special case divide FMA with scale and flags(s0.d = Quotient,
    // s1.d = Denominator, s2.d = Numerator)
    // Per-lane fused multiply-add; when the lane's VCC bit is set the
    // result is additionally post-scaled by 2^64.
    void
    Inst_VOP3__V_DIV_FMAS_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();
        vcc.read();

        // FP input modifiers, one select bit per source.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (instData.ABS & 0x4) {
            src2.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        if (extData.NEG & 0x4) {
            src2.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Lane's VCC bit selects the 2^64 post-scale.
                if (bits(vcc.rawData(), lane)) {
                    vdst[lane] = std::pow(2, 64)
                        * std::fma(src0[lane], src1[lane], src2[lane]);
                } else {
                    vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MSAD_U8 class methods ---

    Inst_VOP3__V_MSAD_U8::Inst_VOP3__V_MSAD_U8(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_msad_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MSAD_U8

    Inst_VOP3__V_MSAD_U8::~Inst_VOP3__V_MSAD_U8()
    {
    } // ~Inst_VOP3__V_MSAD_U8

    // --- description from .arch file ---
    // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u).
    // Not implemented in this model; executing the instruction panics.
    void
    Inst_VOP3__V_MSAD_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_QSAD_PK_U16_U8 class methods ---

    Inst_VOP3__V_QSAD_PK_U16_U8::Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_qsad_pk_u16_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_QSAD_PK_U16_U8

    Inst_VOP3__V_QSAD_PK_U16_U8::~Inst_VOP3__V_QSAD_PK_U16_U8()
    {
    } // ~Inst_VOP3__V_QSAD_PK_U16_U8

    // --- description from .arch file ---
    // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
    // S1.u[31:0], S2.u[63:0])
    // Not implemented in this model; executing the instruction panics.
    void
    Inst_VOP3__V_QSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MQSAD_PK_U16_U8 class methods ---

    Inst_VOP3__V_MQSAD_PK_U16_U8::Inst_VOP3__V_MQSAD_PK_U16_U8(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mqsad_pk_u16_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MQSAD_PK_U16_U8

    Inst_VOP3__V_MQSAD_PK_U16_U8::~Inst_VOP3__V_MQSAD_PK_U16_U8()
    {
    } // ~Inst_VOP3__V_MQSAD_PK_U16_U8

    // --- description from .arch file ---
    // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
    // --- S1.u[31:0], S2.u[63:0])
    // Not implemented in this model; executing the instruction panics.
    void
    Inst_VOP3__V_MQSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MQSAD_U32_U8 class methods ---

    Inst_VOP3__V_MQSAD_U32_U8::Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mqsad_u32_u8", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MQSAD_U32_U8

    Inst_VOP3__V_MQSAD_U32_U8::~Inst_VOP3__V_MQSAD_U32_U8()
    {
    } // ~Inst_VOP3__V_MQSAD_U32_U8

    // --- description from .arch file ---
    // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0],
    // --- S1.u[31:0], S2.u[127:0])
    // Not implemented in this model; executing the instruction panics.
    void
    Inst_VOP3__V_MQSAD_U32_U8::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MAD_U64_U32 class methods ---

    Inst_VOP3__V_MAD_U64_U32::Inst_VOP3__V_MAD_U64_U32(
          InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_mad_u64_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_U64_U32

    Inst_VOP3__V_MAD_U64_U32::~Inst_VOP3__V_MAD_U64_U32()
    {
    } // ~Inst_VOP3__V_MAD_U64_U32

    // --- description from .arch file ---
    // {vcc_out,D.u64} = S0.u32 * S1.u32 + S2.u64.
    void
    Inst_VOP3__V_MAD_U64_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();
        // Destination is read first so lanes with EXEC cleared keep
        // their previous values across the whole-register write below.
        vdst.read();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // muladd() writes the 64-bit result into vdst[lane] and
                // returns the carry-out, recorded in this lane's VCC bit.
                vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
                    src2[lane]));
            }
        }

        vcc.write();
        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MAD_I64_I32 class methods ---

    Inst_VOP3__V_MAD_I64_I32::Inst_VOP3__V_MAD_I64_I32(
          InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_mad_i64_i32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(MAD);
    } // Inst_VOP3__V_MAD_I64_I32

    Inst_VOP3__V_MAD_I64_I32::~Inst_VOP3__V_MAD_I64_I32()
    {
    } // ~Inst_VOP3__V_MAD_I64_I32

    // --- description from .arch file ---
    // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64.
- void - Inst_VOP3__V_MAD_I64_I32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI64 src2(gpuDynInst, extData.SRC2); - ScalarOperandU64 vcc(gpuDynInst, instData.SDST); - VecOperandI64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], - src2[lane])); - } - } - - vcc.write(); - vdst.write(); - } // execute - // --- Inst_VOP3__V_XAD_U32 class methods --- - - Inst_VOP3__V_XAD_U32::Inst_VOP3__V_XAD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_xad_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_XAD_U32 - - Inst_VOP3__V_XAD_U32::~Inst_VOP3__V_XAD_U32() - { - } // ~Inst_VOP3__V_XAD_U32 - - // --- description from .arch file --- - // D.u32 = (S0.u32 ^ S1.u32) + S2.u32. 
- void - Inst_VOP3__V_XAD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] ^ src1[lane]) + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHL_ADD_U32 class methods --- - - Inst_VOP3__V_LSHL_ADD_U32::Inst_VOP3__V_LSHL_ADD_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshl_add_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHL_ADD_U32 - - Inst_VOP3__V_LSHL_ADD_U32::~Inst_VOP3__V_LSHL_ADD_U32() - { - } // ~Inst_VOP3__V_LSHL_ADD_U32 - - // --- description from .arch file --- - // D.u = (S0.u << S1.u[4:0]) + S2.u. 
- void - Inst_VOP3__V_LSHL_ADD_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) - + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD_LSHL_U32 class methods --- - - Inst_VOP3__V_ADD_LSHL_U32::Inst_VOP3__V_ADD_LSHL_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add_lshl_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD_LSHL_U32 - - Inst_VOP3__V_ADD_LSHL_U32::~Inst_VOP3__V_ADD_LSHL_U32() - { - } // ~Inst_VOP3__V_ADD_LSHL_U32 - - // --- description from .arch file --- - // D.u = (S0.u + S1.u) << S2.u[4:0]. 
- void - Inst_VOP3__V_ADD_LSHL_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = - (src0[lane] + src1[lane]) << bits(src2[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_ADD3_U32 class methods --- - - Inst_VOP3__V_ADD3_U32::Inst_VOP3__V_ADD3_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_add3_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_ADD3_U32 - - Inst_VOP3__V_ADD3_U32::~Inst_VOP3__V_ADD3_U32() - { - } // ~Inst_VOP3__V_ADD3_U32 - - // --- description from .arch file --- - // D.u = S0.u + S1.u + S2.u. 
- void - Inst_VOP3__V_ADD3_U32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane] + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHL_OR_B32 class methods --- - - Inst_VOP3__V_LSHL_OR_B32::Inst_VOP3__V_LSHL_OR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshl_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHL_OR_B32 - - Inst_VOP3__V_LSHL_OR_B32::~Inst_VOP3__V_LSHL_OR_B32() - { - } // ~Inst_VOP3__V_LSHL_OR_B32 - - // --- description from .arch file --- - // D.u = (S0.u << S1.u[4:0]) | S2.u. 
- void - Inst_VOP3__V_LSHL_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) - | src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_AND_OR_B32 class methods --- - - Inst_VOP3__V_AND_OR_B32::Inst_VOP3__V_AND_OR_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_and_or_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_AND_OR_B32 - - Inst_VOP3__V_AND_OR_B32::~Inst_VOP3__V_AND_OR_B32() - { - } // ~Inst_VOP3__V_AND_OR_B32 - - // --- description from .arch file --- - // D.u = (S0.u & S1.u) | S2.u. - // Input and output modifiers not supported. 
- void - Inst_VOP3__V_AND_OR_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = (src0[lane] & src1[lane]) | src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_F16 class methods --- - - Inst_VOP3__V_MAD_F16::Inst_VOP3__V_MAD_F16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_f16", false) - { - setFlag(ALU); - setFlag(F16); - setFlag(MAD); - } // Inst_VOP3__V_MAD_F16 - - Inst_VOP3__V_MAD_F16::~Inst_VOP3__V_MAD_F16() - { - } // ~Inst_VOP3__V_MAD_F16 - - // --- description from .arch file --- - // D.f16 = S0.f16 * S1.f16 + S2.f16. - // Supports round mode, exception flags, saturation. - void - Inst_VOP3__V_MAD_F16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_MAD_U16 class methods --- - - Inst_VOP3__V_MAD_U16::Inst_VOP3__V_MAD_U16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_u16", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_U16 - - Inst_VOP3__V_MAD_U16::~Inst_VOP3__V_MAD_U16() - { - } // ~Inst_VOP3__V_MAD_U16 - - // --- description from .arch file --- - // D.u16 = S0.u16 * S1.u16 + S2.u16. - // Supports saturation (unsigned 16-bit integer domain). 
- void - Inst_VOP3__V_MAD_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandU16 src2(gpuDynInst, extData.SRC2); - VecOperandU16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane] + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MAD_I16 class methods --- - - Inst_VOP3__V_MAD_I16::Inst_VOP3__V_MAD_I16(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mad_i16", false) - { - setFlag(ALU); - setFlag(MAD); - } // Inst_VOP3__V_MAD_I16 - - Inst_VOP3__V_MAD_I16::~Inst_VOP3__V_MAD_I16() - { - } // ~Inst_VOP3__V_MAD_I16 - - // --- description from .arch file --- - // D.i16 = S0.i16 * S1.i16 + S2.i16. - // Supports saturation (signed 16-bit integer domain). 
- void - Inst_VOP3__V_MAD_I16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); - ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI16 src2(gpuDynInst, extData.SRC2); - VecOperandI16 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - src2.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] * src1[lane] + src2[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_PERM_B32 class methods --- - - Inst_VOP3__V_PERM_B32::Inst_VOP3__V_PERM_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_perm_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_PERM_B32 - - Inst_VOP3__V_PERM_B32::~Inst_VOP3__V_PERM_B32() - { - } // ~Inst_VOP3__V_PERM_B32 - - // --- description from .arch file --- - // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]); - // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]); - // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]); - // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]); - // byte permute(byte in[8], byte sel) { - // if(sel>=13) then return 0xff; - // elsif(sel==12) then return 0x00; - // elsif(sel==11) then return in[7][7] * 0xff; - // elsif(sel==10) then return in[5][7] * 0xff; - // elsif(sel==9) then return in[3][7] * 0xff; - // elsif(sel==8) then return in[1][7] * 0xff; - // else return in[sel]; - // } - // Byte permute. 
    // Byte permute: each byte of S2 selects one byte (or a special value)
    // out of the 64-bit concatenation {S0, S1}; the four selected bytes
    // are packed into the destination. Selection is done by the
    // file-local permute() helper.
    void
    Inst_VOP3__V_PERM_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Build the 64-bit byte pool {S0[31:0], S1[31:0]}, with
                // S0 in the upper half per the .arch description.
                VecElemU64 selector = (VecElemU64)src0[lane];
                selector = (selector << 32) | (VecElemU64)src1[lane];
                // Cleared before the OR-accumulation below.
                vdst[lane] = 0;

                DPRINTF(VEGA, "Executing v_perm_b32 src_0 0x%08x, src_1 "
                    "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane],
                    src1[lane], src2[lane], vdst[lane]);
                DPRINTF(VEGA, "Selector: 0x%08x \n", selector);

                // One iteration per destination byte; byte i of S2 is the
                // selection code for destination byte i.
                for (int i = 0; i < 4 ; ++i) {
                    VecElemU32 permuted_val = permute(selector, 0xFF
                        & ((VecElemU32)src2[lane] >> (8 * i)));
                    vdst[lane] |= (permuted_val << (8 * i));
                }

                DPRINTF(VEGA, "v_perm result: 0x%08x\n", vdst[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_FMA_F16 class methods ---

    Inst_VOP3__V_FMA_F16::Inst_VOP3__V_FMA_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_fma_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(FMA);
    } // Inst_VOP3__V_FMA_F16

    Inst_VOP3__V_FMA_F16::~Inst_VOP3__V_FMA_F16()
    {
    } // ~Inst_VOP3__V_FMA_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16 + S2.f16.
    // Fused half precision multiply add.
    // Not implemented in the model; panics at runtime.
    void
    Inst_VOP3__V_FMA_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_DIV_FIXUP_F16 class methods ---

    Inst_VOP3__V_DIV_FIXUP_F16::Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_div_fixup_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_DIV_FIXUP_F16

    Inst_VOP3__V_DIV_FIXUP_F16::~Inst_VOP3__V_DIV_FIXUP_F16()
    {
    } // ~Inst_VOP3__V_DIV_FIXUP_F16

    // --- description from .arch file ---
    // sign_out = sign(S1.f16)^sign(S2.f16);
    // if (S2.f16 == NAN)
    //     D.f16 = Quiet(S2.f16);
    // else if (S1.f16 == NAN)
    //     D.f16 = Quiet(S1.f16);
    // else if (S1.f16 == S2.f16 == 0)
    //     # 0/0
    //     D.f16 = pele_nan(0xfe00);
    // else if (abs(S1.f16) == abs(S2.f16) == +-INF)
    //     # inf/inf
    //     D.f16 = pele_nan(0xfe00);
    // else if (S1.f16 ==0 || abs(S2.f16) == +-INF)
    //     # x/0, or inf/y
    //     D.f16 = sign_out ? -INF : INF;
    // else if (abs(S1.f16) == +-INF || S2.f16 == 0)
    //     # x/inf, 0/y
    //     D.f16 = sign_out ? -0 : 0;
    // else if ((exp(S2.f16) - exp(S1.f16)) < -150)
    //     D.f16 = sign_out ? -underflow : underflow;
    // else if (exp(S1.f16) == 255)
    //     D.f16 = sign_out ? -overflow : overflow;
    // else
    //     D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16).
    // Half precision division fixup.
    // S0 = Quotient, S1 = Denominator, S3 = Numerator.
    // Given a numerator, denominator, and quotient from a divide, this opcode
    // will detect and apply special case numerics, touching up the quotient if
    // necessary. This opcode also generates invalid, denorm and divide by
    // zero exceptions caused by the division.
    // Not implemented in the model; panics at runtime.
    void
    Inst_VOP3__V_DIV_FIXUP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---

    Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_pkaccum_u8_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKACCUM_U8_F32

    Inst_VOP3__V_CVT_PKACCUM_U8_F32::~Inst_VOP3__V_CVT_PKACCUM_U8_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32

    // --- description from .arch file ---
    // byte = S1.u[1:0]; bit = byte * 8;
    // D.u[bit+7:bit] = flt32_to_uint8(S0.f);
    // Pack converted value of S0.f into byte S1 of the destination.
    // SQ translates to V_CVT_PK_U8_F32.
    // Note: this opcode uses src_c to pass destination in as a source.
    // Not implemented in the model; panics at runtime.
    void
    Inst_VOP3__V_CVT_PKACCUM_U8_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_INTERP_P1_F32 class methods ---

    Inst_VOP3__V_INTERP_P1_F32::Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_interp_p1_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_INTERP_P1_F32

    Inst_VOP3__V_INTERP_P1_F32::~Inst_VOP3__V_INTERP_P1_F32()
    {
    } // ~Inst_VOP3__V_INTERP_P1_F32

    // --- description from .arch file ---
    // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to
    // V_MAD_F32 for SP).
    // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; if
    // D == S then data corruption will occur.
    // NOTE: In textual representations the I/J VGPR is the first source and
    // the attribute is the second source; however in the VOP3 encoding the
    // attribute is stored in the src0 field and the VGPR is stored in the
    // src1 field.
    // Not implemented in the model; panics at runtime.
    void
    Inst_VOP3__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_INTERP_P2_F32 class methods ---

    Inst_VOP3__V_INTERP_P2_F32::Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_interp_p2_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_INTERP_P2_F32

    Inst_VOP3__V_INTERP_P2_F32::~Inst_VOP3__V_INTERP_P2_F32()
    {
    } // ~Inst_VOP3__V_INTERP_P2_F32

    // --- description from .arch file ---
    // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to
    // V_MAD_F32 for SP).
    // NOTE: In textual representations the I/J VGPR is the first source and
    // the attribute is the second source; however in the VOP3 encoding the
    // attribute is stored in the src0 field and the VGPR is stored in the
    // src1 field.
    // Not implemented in the model; panics at runtime.
    void
    Inst_VOP3__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_INTERP_MOV_F32 class methods ---

    Inst_VOP3__V_INTERP_MOV_F32::Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_interp_mov_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_INTERP_MOV_F32

    Inst_VOP3__V_INTERP_MOV_F32::~Inst_VOP3__V_INTERP_MOV_F32()
    {
    } // ~Inst_VOP3__V_INTERP_MOV_F32

    // --- description from .arch file ---
    // D.f = {P10,P20,P0}[S.u]; parameter load.
    // Not implemented in the model; panics at runtime.
    void
    Inst_VOP3__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_INTERP_P1LL_F16 class methods ---

    Inst_VOP3__V_INTERP_P1LL_F16::Inst_VOP3__V_INTERP_P1LL_F16(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_interp_p1ll_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_INTERP_P1LL_F16

    Inst_VOP3__V_INTERP_P1LL_F16::~Inst_VOP3__V_INTERP_P1LL_F16()
    {
    } // ~Inst_VOP3__V_INTERP_P1LL_F16

    // --- description from .arch file ---
    // D.f32 = P10.f16 * S0.f32 + P0.f16.
    // 'LL' stands for 'two LDS arguments'.
    // attr_word selects the high or low half 16 bits of each LDS dword
    // accessed.
    // This opcode is available for 32-bank LDS only.
    // NOTE: In textual representations the I/J VGPR is the first source and
    // the attribute is the second source; however in the VOP3 encoding the
    // attribute is stored in the src0 field and the VGPR is stored in the
    // src1 field.
    // Not implemented in the model; panics at runtime.
    void
    Inst_VOP3__V_INTERP_P1LL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_INTERP_P1LV_F16 class methods ---

    Inst_VOP3__V_INTERP_P1LV_F16::Inst_VOP3__V_INTERP_P1LV_F16(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_interp_p1lv_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_INTERP_P1LV_F16

    Inst_VOP3__V_INTERP_P1LV_F16::~Inst_VOP3__V_INTERP_P1LV_F16()
    {
    } // ~Inst_VOP3__V_INTERP_P1LV_F16

    // --- description from .arch file ---
    // D.f32 = P10.f16 * S0.f32 + (S2.u32 >> (attr_word * 16)).f16.
    // 'LV' stands for 'One LDS and one VGPR argument'.
    // S2 holds two parameters, attr_word selects the high or low word of the
    // VGPR for this calculation, as well as the high or low half of the LDS
    // data.
    // Meant for use with 16-bank LDS.
    // NOTE: In textual representations the I/J VGPR is the first source and
    // the attribute is the second source; however in the VOP3 encoding the
    // attribute is stored in the src0 field and the VGPR is stored in the
    // src1 field.
    // Not implemented in the model; panics at runtime.
    void
    Inst_VOP3__V_INTERP_P1LV_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_INTERP_P2_F16 class methods ---

    Inst_VOP3__V_INTERP_P2_F16::Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_interp_p2_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_INTERP_P2_F16

    Inst_VOP3__V_INTERP_P2_F16::~Inst_VOP3__V_INTERP_P2_F16()
    {
    } // ~Inst_VOP3__V_INTERP_P2_F16

    // --- description from .arch file ---
    // D.f16 = P20.f16 * S0.f32 + S2.f32.
    // Final computation. attr_word selects LDS high or low 16bits. Used for
    // both 16- and 32-bank LDS.
    // Result is always written to the 16 LSBs of the destination VGPR.
    // NOTE: In textual representations the I/J VGPR is the first source and
    // the attribute is the second source; however in the VOP3 encoding the
    // attribute is stored in the src0 field and the VGPR is stored in the
    // src1 field.
    // Not implemented in the model; panics at runtime.
    void
    Inst_VOP3__V_INTERP_P2_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_ADD_F64 class methods ---

    Inst_VOP3__V_ADD_F64::Inst_VOP3__V_ADD_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_add_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_ADD_F64

    Inst_VOP3__V_ADD_F64::~Inst_VOP3__V_ADD_F64()
    {
    } // ~Inst_VOP3__V_ADD_F64

    // --- description from .arch file ---
    // D.d = S0.d + S1.d.
    // Double precision add with explicit special-case handling (NaN, inf,
    // and zero/subnormal inputs, which this model flushes to signed zero)
    // before falling through to the ordinary hardware-like add.
    void
    Inst_VOP3__V_ADD_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // ABS/NEG bits 0 and 1 apply to src0/src1 respectively.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        // Bit 2 would refer to a third source, which this opcode lacks.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isnan(src0[lane]) ||
                    std::isnan(src1[lane]) ) {
                    // Any NaN input produces NaN.
                    vdst[lane] = NAN;
                } else if (std::isinf(src0[lane]) &&
                           std::isinf(src1[lane])) {
                    // inf + (-inf) is NaN; same-signed infinities pass
                    // through.
                    if (std::signbit(src0[lane]) !=
                        std::signbit(src1[lane])) {
                        vdst[lane] = NAN;
                    } else {
                        vdst[lane] = src0[lane];
                    }
                } else if (std::isinf(src0[lane])) {
                    vdst[lane] = src0[lane];
                } else if (std::isinf(src1[lane])) {
                    vdst[lane] = src1[lane];
                } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) {
                    // src0 is zero/subnormal (treated as zero): result is
                    // src1, except zero+zero which keeps IEEE sign rules
                    // (-0 only when both operands are negative zero-ish).
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        if (std::signbit(src0[lane]) &&
                            std::signbit(src1[lane])) {
                            vdst[lane] = -0.0;
                        } else {
                            vdst[lane] = 0.0;
                        }
                    } else {
                        vdst[lane] = src1[lane];
                    }
                } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src1[lane]) == FP_ZERO) {
                    // Mirror image of the case above for src1.
                    if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src0[lane]) == FP_ZERO) {
                        if (std::signbit(src0[lane]) &&
                            std::signbit(src1[lane])) {
                            vdst[lane] = -0.0;
                        } else {
                            vdst[lane] = 0.0;
                        }
                    } else {
                        vdst[lane] = src0[lane];
                    }
                } else {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // ---
    // Inst_VOP3__V_MUL_F64 class methods ---

    Inst_VOP3__V_MUL_F64::Inst_VOP3__V_MUL_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mul_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_MUL_F64

    Inst_VOP3__V_MUL_F64::~Inst_VOP3__V_MUL_F64()
    {
    } // ~Inst_VOP3__V_MUL_F64

    // --- description from .arch file ---
    // D.d = S0.d * S1.d.
    // Double precision multiply with explicit special cases: NaN inputs,
    // zero/subnormal-times-inf (NaN), and signed zero/infinity results.
    // Subnormal src0 values are treated as (signed) zero.
    void
    Inst_VOP3__V_MUL_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        // ABS/NEG bits 0 and 1 apply to src0/src1 respectively.
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        // Bit 2 would refer to a third source, which this opcode lacks.
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isnan(src0[lane]) ||
                    std::isnan(src1[lane])) {
                    vdst[lane] = NAN;
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           !std::signbit(src0[lane])) {
                    // +0 * x: NaN if x is inf, else zero with the sign
                    // of x.
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           std::signbit(src0[lane])) {
                    // -0 * x: NaN if x is inf, else zero with the
                    // opposite sign of x.
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if (std::isinf(src0[lane]) &&
                           !std::signbit(src0[lane])) {
                    // +inf * x: NaN if x is zero-ish, else inf with the
                    // sign of x.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else if (std::isinf(src0[lane]) &&
                           std::signbit(src0[lane])) {
                    // -inf * x: NaN if x is zero-ish, else inf with the
                    // opposite sign of x.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else {
                    vdst[lane] = src0[lane] * src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MIN_F64 class methods ---

    Inst_VOP3__V_MIN_F64::Inst_VOP3__V_MIN_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_min_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_MIN_F64

    Inst_VOP3__V_MIN_F64::~Inst_VOP3__V_MIN_F64()
    {
    } // ~Inst_VOP3__V_MIN_F64

    // --- description from .arch file ---
    // D.d = min(S0.d, S1.d).
    void
    Inst_VOP3__V_MIN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // std::fmin returns the non-NaN operand when exactly one
                // input is NaN.
                vdst[lane] = std::fmin(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MAX_F64 class methods ---

    Inst_VOP3__V_MAX_F64::Inst_VOP3__V_MAX_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_max_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_MAX_F64

    Inst_VOP3__V_MAX_F64::~Inst_VOP3__V_MAX_F64()
    {
    } // ~Inst_VOP3__V_MAX_F64

    // --- description from .arch file ---
    // D.d = max(S0.d, S1.d).
    void
    Inst_VOP3__V_MAX_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
        VecOperandF64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // std::fmax returns the non-NaN operand when exactly one
                // input is NaN.
                vdst[lane] = std::fmax(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LDEXP_F64 class methods ---

    Inst_VOP3__V_LDEXP_F64::Inst_VOP3__V_LDEXP_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ldexp_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_LDEXP_F64

    Inst_VOP3__V_LDEXP_F64::~Inst_VOP3__V_LDEXP_F64()
    {
    } // ~Inst_VOP3__V_LDEXP_F64

    // --- description from .arch file ---
    // D.d = pow(S0.d, S1.i[31:0]).
- void - Inst_VOP3__V_LDEXP_F64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandF64 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - if (instData.ABS & 0x1) { - src0.absModifier(); - } - - if (extData.NEG & 0x1) { - src0.negModifier(); - } - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - if (std::isnan(src0[lane]) || std::isinf(src0[lane])) { - vdst[lane] = src0[lane]; - } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL - || std::fpclassify(src0[lane]) == FP_ZERO) { - if (std::signbit(src0[lane])) { - vdst[lane] = -0.0; - } else { - vdst[lane] = +0.0; - } - } else { - vdst[lane] = std::ldexp(src0[lane], src1[lane]); - } - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MUL_LO_U32 class methods --- - - Inst_VOP3__V_MUL_LO_U32::Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mul_lo_u32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MUL_LO_U32 - - Inst_VOP3__V_MUL_LO_U32::~Inst_VOP3__V_MUL_LO_U32() - { - } // ~Inst_VOP3__V_MUL_LO_U32 - - // --- description from .arch file --- - // D.u = S0.u * S1.u. 
    // D.u = S0.u * S1.u (low 32 bits of the product).
    void
    Inst_VOP3__V_MUL_LO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Widen to 64 bits so the multiply cannot overflow, then
                // keep only the low dword.
                VecElemI64 s0 = (VecElemI64)src0[lane];
                VecElemI64 s1 = (VecElemI64)src1[lane];
                vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MUL_HI_U32 class methods ---

    Inst_VOP3__V_MUL_HI_U32::Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mul_hi_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_HI_U32

    Inst_VOP3__V_MUL_HI_U32::~Inst_VOP3__V_MUL_HI_U32()
    {
    } // ~Inst_VOP3__V_MUL_HI_U32

    // --- description from .arch file ---
    // D.u = (S0.u * S1.u) >> 32.
    // D.u = (S0.u * S1.u) >> 32 (high 32 bits of the 64-bit product).
    void
    Inst_VOP3__V_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Widen, multiply, then take the upper dword of the
                // 64-bit product.
                VecElemI64 s0 = (VecElemI64)src0[lane];
                VecElemI64 s1 = (VecElemI64)src1[lane];
                vdst[lane]
                    = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MUL_HI_I32 class methods ---

    Inst_VOP3__V_MUL_HI_I32::Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mul_hi_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MUL_HI_I32

    Inst_VOP3__V_MUL_HI_I32::~Inst_VOP3__V_MUL_HI_I32()
    {
    } // ~Inst_VOP3__V_MUL_HI_I32

    // --- description from .arch file ---
    // D.i = (S0.i * S1.i) >> 32.
    // D.i = (S0.i * S1.i) >> 32 (high 32 bits of the signed product).
    void
    Inst_VOP3__V_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Sign-extend to 64 bits, multiply, keep the high dword.
                VecElemI64 s0 = (VecElemI64)src0[lane];
                VecElemI64 s1 = (VecElemI64)src1[lane];
                vdst[lane]
                    = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LDEXP_F32 class methods ---

    Inst_VOP3__V_LDEXP_F32::Inst_VOP3__V_LDEXP_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ldexp_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_LDEXP_F32

    Inst_VOP3__V_LDEXP_F32::~Inst_VOP3__V_LDEXP_F32()
    {
    } // ~Inst_VOP3__V_LDEXP_F32

    // --- description from .arch file ---
    // D.f = pow(S0.f, S1.i)
    // NOTE(review): as with the f64 variant, the implementation is ldexp
    // (S0.f * 2^S1.i), not pow.
    void
    Inst_VOP3__V_LDEXP_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::ldexp(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_READLANE_B32 class methods ---

    Inst_VOP3__V_READLANE_B32::Inst_VOP3__V_READLANE_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_readlane_b32", true)
    {
        setFlag(ALU);
        setFlag(IgnoreExec);
    } // Inst_VOP3__V_READLANE_B32

    Inst_VOP3__V_READLANE_B32::~Inst_VOP3__V_READLANE_B32()
    {
    } // ~Inst_VOP3__V_READLANE_B32

    // --- description from .arch file ---
    // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR#
    // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask.
    // Input and output modifiers not supported; this is an untyped operation.
    void
    Inst_VOP3__V_READLANE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
        ScalarOperandU32 sdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Lane select is taken modulo the 64-lane wavefront (& 0x3f);
        // the exec mask is deliberately ignored for this instruction.
        sdst = src0[src1.rawData() & 0x3f];

        sdst.write();
    } // execute
    // --- Inst_VOP3__V_WRITELANE_B32 class methods ---

    Inst_VOP3__V_WRITELANE_B32::Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_writelane_b32", false)
    {
        setFlag(ALU);
        setFlag(IgnoreExec);
    } // Inst_VOP3__V_WRITELANE_B32

    Inst_VOP3__V_WRITELANE_B32::~Inst_VOP3__V_WRITELANE_B32()
    {
    } // ~Inst_VOP3__V_WRITELANE_B32

    // --- description from .arch file ---
    // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data
    // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores
    // exec mask.
    // Input and output modifiers not supported; this is an untyped operation.
    // SQ translates to V_MOV_B32.
    // Write a scalar value into a single lane of the destination VGPR;
    // the exec mask is ignored.
    void
    Inst_VOP3__V_WRITELANE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.read();
        src1.read();
        // Read-modify-write: preserve all lanes except the selected one.
        vdst.read();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        // Lane select wraps at the 64-lane wavefront boundary.
        vdst[src1.rawData() & 0x3f] = src0.rawData();

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_BCNT_U32_B32 class methods ---

    Inst_VOP3__V_BCNT_U32_B32::Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_bcnt_u32_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BCNT_U32_B32

    Inst_VOP3__V_BCNT_U32_B32::~Inst_VOP3__V_BCNT_U32_B32()
    {
    } // ~Inst_VOP3__V_BCNT_U32_B32

    // --- description from .arch file ---
    // D.u = CountOneBits(S0.u) + S1.u. Bit count.
    // D.u = CountOneBits(S0.u) + S1.u.
    void
    Inst_VOP3__V_BCNT_U32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Population count of src0 plus the src1 accumulator.
                vdst[lane] = popCount(src0[lane]) + src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MBCNT_LO_U32_B32 class methods ---

    Inst_VOP3__V_MBCNT_LO_U32_B32::Inst_VOP3__V_MBCNT_LO_U32_B32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mbcnt_lo_u32_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MBCNT_LO_U32_B32

    Inst_VOP3__V_MBCNT_LO_U32_B32::~Inst_VOP3__V_MBCNT_LO_U32_B32()
    {
    } // ~Inst_VOP3__V_MBCNT_LO_U32_B32

    // --- description from .arch file ---
    // ThreadMask = (1 << ThreadPosition) - 1;
    // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u.
    // Masked bit count, ThreadPosition is the position of this thread in the
    // --- wavefront (in 0..63).
- void - Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - uint64_t threadMask = 0; - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); - vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods --- - - Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_mbcnt_hi_u32_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_MBCNT_HI_U32_B32 - - Inst_VOP3__V_MBCNT_HI_U32_B32::~Inst_VOP3__V_MBCNT_HI_U32_B32() - { - } // ~Inst_VOP3__V_MBCNT_HI_U32_B32 - - // --- description from .arch file --- - // ThreadMask = (1 << ThreadPosition) - 1; - // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u. - // Masked bit count, ThreadPosition is the position of this thread in the - // --- wavefront (in 0..63). 
- void - Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - uint64_t threadMask = 0; - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); - vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + - src1[lane]; - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_LSHLREV_B64 class methods --- - - Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_lshlrev_b64", false) - { - setFlag(ALU); - } // Inst_VOP3__V_LSHLREV_B64 - - Inst_VOP3__V_LSHLREV_B64::~Inst_VOP3__V_LSHLREV_B64() - { - } // ~Inst_VOP3__V_LSHLREV_B64 - - // --- description from .arch file --- - // D.u64 = S1.u64 << S0.u[5:0]. - // SQ translates this to an internal SP opcode. 
    // D.u64 = S1.u64 << S0.u[5:0]. Note the reversed operand order: the
    // shift amount comes from S0 and the value from S1.
    void
    Inst_VOP3__V_LSHLREV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Only the low 6 bits of the shift amount are used (0..63).
                vdst[lane] = src1[lane] << bits(src0[lane], 5, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LSHRREV_B64 class methods ---

    Inst_VOP3__V_LSHRREV_B64::Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_lshrrev_b64", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHRREV_B64

    Inst_VOP3__V_LSHRREV_B64::~Inst_VOP3__V_LSHRREV_B64()
    {
    } // ~Inst_VOP3__V_LSHRREV_B64

    // --- description from .arch file ---
    // D.u64 = S1.u64 >> S0.u[5:0].
    // The vacated bits are set to zero.
    // SQ translates this to an internal SP opcode.
    // D.u64 = S1.u64 >> S0.u[5:0]. Logical (zero-fill) right shift with
    // reversed operand order.
    void
    Inst_VOP3__V_LSHRREV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // src1 is unsigned, so >> is a logical shift; only the low
                // 6 bits of the shift amount are used.
                vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_ASHRREV_I64 class methods ---

    Inst_VOP3__V_ASHRREV_I64::Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ashrrev_i64", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ASHRREV_I64

    Inst_VOP3__V_ASHRREV_I64::~Inst_VOP3__V_ASHRREV_I64()
    {
    } // ~Inst_VOP3__V_ASHRREV_I64

    // --- description from .arch file ---
    // D.u64 = signext(S1.u64) >> S0.u[5:0].
    // The vacated bits are set to the sign bit of the input value.
    // SQ translates this to an internal SP opcode.
    // D.u64 = signext(S1.u64) >> S0.u[5:0]. Arithmetic right shift with
    // reversed operand order; src1 is signed so the sign bit is replicated.
    void
    Inst_VOP3__V_ASHRREV_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI64 src1(gpuDynInst, extData.SRC1);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Signed >> performs the arithmetic shift; only the low
                // 6 bits of the shift amount are used.
                vdst[lane]
                    = src1[lane] >> bits(src0[lane], 5, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_TRIG_PREOP_F64 class methods ---

    Inst_VOP3__V_TRIG_PREOP_F64::Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_trig_preop_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_TRIG_PREOP_F64

    Inst_VOP3__V_TRIG_PREOP_F64::~Inst_VOP3__V_TRIG_PREOP_F64()
    {
    } // ~Inst_VOP3__V_TRIG_PREOP_F64

    // --- description from .arch file ---
    // D.d = Look Up 2/PI (S0.d) with segment select S1.u[4:0]. This operation
    // returns an aligned, double precision segment of 2/PI needed to do range
    // reduction on S0.d (double-precision value). Multiple segments can be
    // specified through S1.u[4:0]. Rounding is always round-to-zero. Large
    // inputs (exp > 1968) are scaled to avoid loss of precision through
    // denormalization.
- void - Inst_VOP3__V_TRIG_PREOP_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_VOP3__V_BFM_B32 class methods --- - - Inst_VOP3__V_BFM_B32::Inst_VOP3__V_BFM_B32(InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_bfm_b32", false) - { - setFlag(ALU); - } // Inst_VOP3__V_BFM_B32 - - Inst_VOP3__V_BFM_B32::~Inst_VOP3__V_BFM_B32() - { - } // ~Inst_VOP3__V_BFM_B32 - - // --- description from .arch file --- - // D.u = ((1<wavefront(); - ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - VecOperandU32 vdst(gpuDynInst, instData.VDST); - - src0.readSrc(); - src1.readSrc(); - - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1) - << bits(src1[lane], 4, 0); - } - } - - vdst.write(); - } // execute - // --- Inst_VOP3__V_CVT_PKNORM_I16_F32 class methods --- - - Inst_VOP3__V_CVT_PKNORM_I16_F32::Inst_VOP3__V_CVT_PKNORM_I16_F32( - InFmt_VOP3A *iFmt) - : Inst_VOP3A(iFmt, "v_cvt_pknorm_i16_f32", false) - { - setFlag(ALU); - setFlag(F32); - } // Inst_VOP3__V_CVT_PKNORM_I16_F32 - - Inst_VOP3__V_CVT_PKNORM_I16_F32::~Inst_VOP3__V_CVT_PKNORM_I16_F32() - { - } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32 - - // --- description from .arch file --- - // D = {(snorm)S1.f, (snorm)S0.f}. 
    // Packed-conversion opcodes below are not implemented in the simulator;
    // executing any of them is a fatal error.
    void
    Inst_VOP3__V_CVT_PKNORM_I16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_PKNORM_U16_F32 class methods ---

    Inst_VOP3__V_CVT_PKNORM_U16_F32::Inst_VOP3__V_CVT_PKNORM_U16_F32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_pknorm_u16_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKNORM_U16_F32

    Inst_VOP3__V_CVT_PKNORM_U16_F32::~Inst_VOP3__V_CVT_PKNORM_U16_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32

    // --- description from .arch file ---
    // D = {(unorm)S1.f, (unorm)S0.f}.
    void
    Inst_VOP3__V_CVT_PKNORM_U16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_PKRTZ_F16_F32 class methods ---

    Inst_VOP3__V_CVT_PKRTZ_F16_F32::Inst_VOP3__V_CVT_PKRTZ_F16_F32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_pkrtz_f16_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKRTZ_F16_F32

    Inst_VOP3__V_CVT_PKRTZ_F16_F32::~Inst_VOP3__V_CVT_PKRTZ_F16_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32

    // --- description from .arch file ---
    // D = {flt32_to_flt16(S1.f),flt32_to_flt16(S0.f)}, with round-toward-zero
    // --- regardless of current round mode setting in hardware.
    // This opcode is intended for use with 16-bit compressed exports.
    // See V_CVT_F16_F32 for a version that respects the current rounding mode.
    void
    Inst_VOP3__V_CVT_PKRTZ_F16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_PK_U16_U32 class methods ---

    Inst_VOP3__V_CVT_PK_U16_U32::Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_pk_u16_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CVT_PK_U16_U32

    Inst_VOP3__V_CVT_PK_U16_U32::~Inst_VOP3__V_CVT_PK_U16_U32()
    {
    } // ~Inst_VOP3__V_CVT_PK_U16_U32

    // --- description from .arch file ---
    // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}.
    void
    Inst_VOP3__V_CVT_PK_U16_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_PK_I16_I32 class methods ---

    Inst_VOP3__V_CVT_PK_I16_I32::Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_pk_i16_i32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CVT_PK_I16_I32

    Inst_VOP3__V_CVT_PK_I16_I32::~Inst_VOP3__V_CVT_PK_I16_I32()
    {
    } // ~Inst_VOP3__V_CVT_PK_I16_I32

    // --- description from .arch file ---
    // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}.
    void
    Inst_VOP3__V_CVT_PK_I16_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_ADD_U32 class methods ---

    Inst_DS__DS_ADD_U32::Inst_DS__DS_ADD_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_u32")
    {
        setFlag(MemoryRef);
        setFlag(GroupSegment);
        setFlag(AtomicAdd);
        setFlag(AtomicNoReturn);
    } // Inst_DS__DS_ADD_U32

    Inst_DS__DS_ADD_U32::~Inst_DS__DS_ADD_U32()
    {
    } // ~Inst_DS__DS_ADD_U32

    // --- description from .arch file ---
    // 32b:
    // MEM[ADDR] += DATA;
    void
    Inst_DS__DS_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        // Nothing to do when no lanes are active; undo the issue-count
        // bump so LGKM accounting stays balanced.
        if (gpuDynInst->exec_mask.none()) {
            wf->decLGKMInstsIssued();
            return;
        }

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        // Fixed LDS access latency of 24 cycles.
        gpuDynInst->latency.set(
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 data(gpuDynInst, extData.DATA0);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Stage the per-lane atomic operands into the instruction's
        // a_data buffer for the local memory pipeline.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
                    = data[lane];
            }
        }

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
    } // execute

    void
    Inst_DS__DS_ADD_U32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // The 16-bit DS offset is split across two 8-bit fields.
        Addr offset0 = instData.OFFSET0;
        Addr offset1 = instData.OFFSET1;
        Addr offset = (offset1 << 8) | offset0;

        initAtomicAccess<VecElemU32>(gpuDynInst, offset);
    } // initiateAcc

    void
    Inst_DS__DS_ADD_U32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // AtomicNoReturn: nothing to write back on completion.
    } // completeAcc
    // --- Inst_DS__DS_SUB_U32 class methods ---

    Inst_DS__DS_SUB_U32::Inst_DS__DS_SUB_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_u32")
    {
    } // Inst_DS__DS_SUB_U32

    Inst_DS__DS_SUB_U32::~Inst_DS__DS_SUB_U32()
    {
    } // ~Inst_DS__DS_SUB_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_RSUB_U32 class methods ---

    Inst_DS__DS_RSUB_U32::Inst_DS__DS_RSUB_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_u32")
    {
    } // Inst_DS__DS_RSUB_U32

    Inst_DS__DS_RSUB_U32::~Inst_DS__DS_RSUB_U32()
    {
    } // ~Inst_DS__DS_RSUB_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA - MEM[ADDR];
    // RETURN_DATA = tmp.
    // Subtraction with reversed operands.
    void
    Inst_DS__DS_RSUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_INC_U32 class methods ---

    Inst_DS__DS_INC_U32::Inst_DS__DS_INC_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_u32")
    {
    } // Inst_DS__DS_INC_U32

    Inst_DS__DS_INC_U32::~Inst_DS__DS_INC_U32()
    {
    } // ~Inst_DS__DS_INC_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA = tmp.
    // The DS atomic opcodes below are not implemented in the simulator;
    // executing any of them is a fatal error.
    void
    Inst_DS__DS_INC_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_DEC_U32 class methods ---

    Inst_DS__DS_DEC_U32::Inst_DS__DS_DEC_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_u32")
    {
    } // Inst_DS__DS_DEC_U32

    Inst_DS__DS_DEC_U32::~Inst_DS__DS_DEC_U32()
    {
    } // ~Inst_DS__DS_DEC_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare); RETURN_DATA = tmp.
    void
    Inst_DS__DS_DEC_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_I32 class methods ---

    Inst_DS__DS_MIN_I32::Inst_DS__DS_MIN_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_i32")
    {
    } // Inst_DS__DS_MIN_I32

    Inst_DS__DS_MIN_I32::~Inst_DS__DS_MIN_I32()
    {
    } // ~Inst_DS__DS_MIN_I32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_MIN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_I32 class methods ---

    Inst_DS__DS_MAX_I32::Inst_DS__DS_MAX_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_i32")
    {
    } // Inst_DS__DS_MAX_I32

    Inst_DS__DS_MAX_I32::~Inst_DS__DS_MAX_I32()
    {
    } // ~Inst_DS__DS_MAX_I32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    // Unimplemented DS atomics continue here; executing any of them is a
    // fatal error.
    void
    Inst_DS__DS_MAX_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_U32 class methods ---

    Inst_DS__DS_MIN_U32::Inst_DS__DS_MIN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_u32")
    {
    } // Inst_DS__DS_MIN_U32

    Inst_DS__DS_MIN_U32::~Inst_DS__DS_MIN_U32()
    {
    } // ~Inst_DS__DS_MIN_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_MIN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_U32 class methods ---

    Inst_DS__DS_MAX_U32::Inst_DS__DS_MAX_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_u32")
    {
    } // Inst_DS__DS_MAX_U32

    Inst_DS__DS_MAX_U32::~Inst_DS__DS_MAX_U32()
    {
    } // ~Inst_DS__DS_MAX_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_MAX_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_AND_B32 class methods ---

    Inst_DS__DS_AND_B32::Inst_DS__DS_AND_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_b32")
    {
    } // Inst_DS__DS_AND_B32

    Inst_DS__DS_AND_B32::~Inst_DS__DS_AND_B32()
    {
    } // ~Inst_DS__DS_AND_B32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_OR_B32 class methods ---

    Inst_DS__DS_OR_B32::Inst_DS__DS_OR_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_b32")
    {
        setFlag(MemoryRef);
        setFlag(GroupSegment);
        setFlag(AtomicOr);
        setFlag(AtomicNoReturn);
    } // Inst_DS__DS_OR_B32

    Inst_DS__DS_OR_B32::~Inst_DS__DS_OR_B32()
    {
    } // ~Inst_DS__DS_OR_B32

    // --- description from .arch file ---
    // 32b:
    // MEM[ADDR] |= DATA;
    void
    Inst_DS__DS_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        // Nothing to do when no lanes are active; undo the issue-count
        // bump so LGKM accounting stays balanced.
        if (gpuDynInst->exec_mask.none()) {
            wf->decLGKMInstsIssued();
            return;
        }

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        // Fixed LDS access latency of 24 cycles.
        gpuDynInst->latency.set(
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 data(gpuDynInst, extData.DATA0);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Stage per-lane atomic operands into a_data for the local
        // memory pipeline.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
                    = data[lane];
            }
        }

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
    } // execute

    void
    Inst_DS__DS_OR_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // The 16-bit DS offset is split across two 8-bit fields.
        Addr offset0 = instData.OFFSET0;
        Addr offset1 = instData.OFFSET1;
        Addr offset = (offset1 << 8) | offset0;

        initAtomicAccess<VecElemU32>(gpuDynInst, offset);
    } // initiateAcc

    void
    Inst_DS__DS_OR_B32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // AtomicNoReturn: nothing to write back on completion.
    } // completeAcc

    // --- Inst_DS__DS_XOR_B32 class methods ---

    Inst_DS__DS_XOR_B32::Inst_DS__DS_XOR_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_b32")
    {
    } // Inst_DS__DS_XOR_B32

    Inst_DS__DS_XOR_B32::~Inst_DS__DS_XOR_B32()
    {
    } // ~Inst_DS__DS_XOR_B32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_DS__DS_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MSKOR_B32 class methods ---

    Inst_DS__DS_MSKOR_B32::Inst_DS__DS_MSKOR_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_b32")
    {
    } // Inst_DS__DS_MSKOR_B32

    Inst_DS__DS_MSKOR_B32::~Inst_DS__DS_MSKOR_B32()
    {
    } // ~Inst_DS__DS_MSKOR_B32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    // Masked dword OR, D0 contains the mask and D1 contains the new value.
    void
    Inst_DS__DS_MSKOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRITE_B32 class methods ---

    Inst_DS__DS_WRITE_B32::Inst_DS__DS_WRITE_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_b32")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_B32

    Inst_DS__DS_WRITE_B32::~Inst_DS__DS_WRITE_B32()
    {
    } // ~Inst_DS__DS_WRITE_B32

    // --- description from .arch file ---
    // 32b:
    // MEM[ADDR] = DATA.
    // Write dword.
    // Store one dword per active lane to LDS.
    void
    Inst_DS__DS_WRITE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        // Nothing to do when no lanes are active; undo the issue-count
        // bump so LGKM accounting stays balanced.
        if (gpuDynInst->exec_mask.none()) {
            wf->decLGKMInstsIssued();
            return;
        }

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        // Fixed LDS access latency of 24 cycles.
        gpuDynInst->latency.set(
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 data(gpuDynInst, extData.DATA0);

        addr.read();
        data.read();

        calcAddr(gpuDynInst, addr);

        // Stage per-lane store data into d_data for the local memory
        // pipeline.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane]
                    = data[lane];
            }
        }

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
    } // execute

    void
    Inst_DS__DS_WRITE_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // The 16-bit DS offset is split across two 8-bit fields.
        Addr offset0 = instData.OFFSET0;
        Addr offset1 = instData.OFFSET1;
        Addr offset = (offset1 << 8) | offset0;

        initMemWrite<VecElemU32>(gpuDynInst, offset);
    } // initiateAcc

    void
    Inst_DS__DS_WRITE_B32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Stores have no destination register to write back.
    } // completeAcc
    // --- Inst_DS__DS_WRITE2_B32 class methods ---

    Inst_DS__DS_WRITE2_B32::Inst_DS__DS_WRITE2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write2_b32")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE2_B32

    Inst_DS__DS_WRITE2_B32::~Inst_DS__DS_WRITE2_B32()
    {
    } // ~Inst_DS__DS_WRITE2_B32

    // --- description from .arch file ---
    // 32b:
    // MEM[ADDR_BASE + OFFSET0 * 4] = DATA;
    // MEM[ADDR_BASE + OFFSET1 * 4] = DATA2.
    // Write 2 dwords.
    // Store two dwords per active lane at independently offset LDS
    // addresses (offsets scaled by 4 bytes).
    void
    Inst_DS__DS_WRITE2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        // Nothing to do when no lanes are active; undo the issue-count
        // bump so LGKM accounting stays balanced.
        if (gpuDynInst->exec_mask.none()) {
            wf->decLGKMInstsIssued();
            return;
        }

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        // Fixed LDS access latency of 24 cycles.
        gpuDynInst->latency.set(
            gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
        ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
        ConstVecOperandU32 data1(gpuDynInst, extData.DATA1);

        addr.read();
        data0.read();
        data1.read();

        calcAddr(gpuDynInst, addr);

        // d_data is packed pairwise: slot 2*lane holds DATA0 and slot
        // 2*lane+1 holds DATA1 for each active lane.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (gpuDynInst->exec_mask[lane]) {
                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 2]
                    = data0[lane];
                (reinterpret_cast<VecElemU32*>(
                    gpuDynInst->d_data))[lane * 2 + 1] = data1[lane];
            }
        }

        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
    } // execute

    void
    Inst_DS__DS_WRITE2_B32::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Offsets are in dword units; scale to bytes.
        Addr offset0 = instData.OFFSET0 * 4;
        Addr offset1 = instData.OFFSET1 * 4;

        initDualMemWrite<VecElemU32>(gpuDynInst, offset0, offset1);
    }

    void
    Inst_DS__DS_WRITE2_B32::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Stores have no destination register to write back.
    }
    // --- Inst_DS__DS_WRITE2ST64_B32 class methods ---

    Inst_DS__DS_WRITE2ST64_B32::Inst_DS__DS_WRITE2ST64_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write2st64_b32")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE2ST64_B32

    Inst_DS__DS_WRITE2ST64_B32::~Inst_DS__DS_WRITE2ST64_B32()
    {
    } // ~Inst_DS__DS_WRITE2ST64_B32

    // --- description from .arch file ---
    // 32b:
    // MEM[ADDR_BASE + OFFSET0 * 4 * 64] = DATA;
    // MEM[ADDR_BASE + OFFSET1 * 4 * 64] = DATA2;
    // Write 2 dwords.
- void - Inst_DS__DS_WRITE2ST64_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 2] - = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 4 * 64; - Addr offset1 = instData.OFFSET1 * 4 * 64; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_CMPST_B32 class methods --- - - Inst_DS__DS_CMPST_B32::Inst_DS__DS_CMPST_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_b32") - { - } // Inst_DS__DS_CMPST_B32 - - Inst_DS__DS_CMPST_B32::~Inst_DS__DS_CMPST_B32() - { - } // ~Inst_DS__DS_CMPST_B32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP opcode. 
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_CMPST_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_CMPST_F32 class methods ---

    Inst_DS__DS_CMPST_F32::Inst_DS__DS_CMPST_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_CMPST_F32

    Inst_DS__DS_CMPST_F32::~Inst_DS__DS_CMPST_F32()
    {
    } // ~Inst_DS__DS_CMPST_F32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Floating point compare and store that handles NaN/INF/denormal values.
    // Caution, the order of src and cmp are the *opposite* of the
    // --- BUFFER_ATOMIC_FCMPSWAP opcode.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_CMPST_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_F32 class methods ---

    Inst_DS__DS_MIN_F32::Inst_DS__DS_MIN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MIN_F32

    Inst_DS__DS_MIN_F32::~Inst_DS__DS_MIN_F32()
    {
    } // ~Inst_DS__DS_MIN_F32

    // --- description from .arch file ---
    // 32b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    // Floating point minimum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMIN.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_MIN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_F32 class methods ---

    Inst_DS__DS_MAX_F32::Inst_DS__DS_MAX_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MAX_F32

    Inst_DS__DS_MAX_F32::~Inst_DS__DS_MAX_F32()
    {
    } // ~Inst_DS__DS_MAX_F32

    // --- description from .arch file ---
    // 32b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    // Floating point maximum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMAX.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_MAX_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_NOP class methods ---

    Inst_DS__DS_NOP::Inst_DS__DS_NOP(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_nop")
    {
        setFlag(Nop);
    } // Inst_DS__DS_NOP

    Inst_DS__DS_NOP::~Inst_DS__DS_NOP()
    {
    } // ~Inst_DS__DS_NOP

    // --- description from .arch file ---
    // Do nothing.
    void
    Inst_DS__DS_NOP::execute(GPUDynInstPtr gpuDynInst)
    {
        // A DS nop still occupies an LGKM slot at issue; release it here
        // since no memory request will be made.
        gpuDynInst->wavefront()->decLGKMInstsIssued();
    } // execute
    // --- Inst_DS__DS_ADD_F32 class methods ---

    Inst_DS__DS_ADD_F32::Inst_DS__DS_ADD_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_f32")
    {
        setFlag(F32);
        setFlag(MemoryRef);
        setFlag(GroupSegment);
        setFlag(AtomicAdd);
        setFlag(AtomicNoReturn);
    } // Inst_DS__DS_ADD_F32

    Inst_DS__DS_ADD_F32::~Inst_DS__DS_ADD_F32()
    {
    } // ~Inst_DS__DS_ADD_F32

    // --- description from .arch file ---
    // 32b:
    // MEM[ADDR] += DATA;
    // Floating point add that handles NaN/INF/denormal values.
- void - Inst_DS__DS_ADD_F32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandF32 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_ADD_F32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initAtomicAccess(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_ADD_F32::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B8 class methods --- - - Inst_DS__DS_WRITE_B8::Inst_DS__DS_WRITE_B8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b8") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B8 - - Inst_DS__DS_WRITE_B8::~Inst_DS__DS_WRITE_B8() - { - } // ~Inst_DS__DS_WRITE_B8 - - // --- description from .arch file --- - // MEM[ADDR] = DATA[7:0]. - // Byte write. 
- void - Inst_DS__DS_WRITE_B8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU8 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B8::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B8_D16_HI class methods --- - - Inst_DS__DS_WRITE_B8_D16_HI::Inst_DS__DS_WRITE_B8_D16_HI(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b8_d16_hi") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B8_D16_HI - - Inst_DS__DS_WRITE_B8_D16_HI::~Inst_DS__DS_WRITE_B8_D16_HI() - { - } // ~Inst_DS__DS_WRITE_B8_D16_HI - - // --- description from .arch file --- - // MEM[ADDR] = DATA[23:16]. - // Byte write in to high word. 
- void - Inst_DS__DS_WRITE_B8_D16_HI::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU8 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = bits(data[lane], 23, 16); - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B8_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B8_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B16 class methods --- - - Inst_DS__DS_WRITE_B16::Inst_DS__DS_WRITE_B16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b16") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B16 - - Inst_DS__DS_WRITE_B16::~Inst_DS__DS_WRITE_B16() - { - } // ~Inst_DS__DS_WRITE_B16 - - // --- description from .arch file --- - // MEM[ADDR] = DATA[15:0] - // Short write. 
- void - Inst_DS__DS_WRITE_B16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU16 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B16::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_ADD_RTN_U32 class methods --- - - Inst_DS__DS_ADD_RTN_U32::Inst_DS__DS_ADD_RTN_U32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_rtn_u32") - { - } // Inst_DS__DS_ADD_RTN_U32 - - Inst_DS__DS_ADD_RTN_U32::~Inst_DS__DS_ADD_RTN_U32() - { - } // ~Inst_DS__DS_ADD_RTN_U32 - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. 
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_ADD_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_SUB_RTN_U32 class methods ---

    Inst_DS__DS_SUB_RTN_U32::Inst_DS__DS_SUB_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_rtn_u32")
    {
    } // Inst_DS__DS_SUB_RTN_U32

    Inst_DS__DS_SUB_RTN_U32::~Inst_DS__DS_SUB_RTN_U32()
    {
    } // ~Inst_DS__DS_SUB_RTN_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA;
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_SUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_RSUB_RTN_U32 class methods ---

    Inst_DS__DS_RSUB_RTN_U32::Inst_DS__DS_RSUB_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_rtn_u32")
    {
    } // Inst_DS__DS_RSUB_RTN_U32

    Inst_DS__DS_RSUB_RTN_U32::~Inst_DS__DS_RSUB_RTN_U32()
    {
    } // ~Inst_DS__DS_RSUB_RTN_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA - MEM[ADDR];
    // RETURN_DATA = tmp.
    // Subtraction with reversed operands.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_RSUB_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_INC_RTN_U32 class methods ---

    Inst_DS__DS_INC_RTN_U32::Inst_DS__DS_INC_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_rtn_u32")
    {
    } // Inst_DS__DS_INC_RTN_U32

    Inst_DS__DS_INC_RTN_U32::~Inst_DS__DS_INC_RTN_U32()
    {
    } // ~Inst_DS__DS_INC_RTN_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_INC_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_DEC_RTN_U32 class methods ---

    Inst_DS__DS_DEC_RTN_U32::Inst_DS__DS_DEC_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_rtn_u32")
    {
    } // Inst_DS__DS_DEC_RTN_U32

    Inst_DS__DS_DEC_RTN_U32::~Inst_DS__DS_DEC_RTN_U32()
    {
    } // ~Inst_DS__DS_DEC_RTN_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare); RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_DEC_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_RTN_I32 class methods ---

    Inst_DS__DS_MIN_RTN_I32::Inst_DS__DS_MIN_RTN_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_i32")
    {
    } // Inst_DS__DS_MIN_RTN_I32

    Inst_DS__DS_MIN_RTN_I32::~Inst_DS__DS_MIN_RTN_I32()
    {
    } // ~Inst_DS__DS_MIN_RTN_I32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_MIN_RTN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_RTN_I32 class methods ---

    Inst_DS__DS_MAX_RTN_I32::Inst_DS__DS_MAX_RTN_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_i32")
    {
    } // Inst_DS__DS_MAX_RTN_I32

    Inst_DS__DS_MAX_RTN_I32::~Inst_DS__DS_MAX_RTN_I32()
    {
    } // ~Inst_DS__DS_MAX_RTN_I32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_MAX_RTN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_RTN_U32 class methods ---

    Inst_DS__DS_MIN_RTN_U32::Inst_DS__DS_MIN_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_u32")
    {
    } // Inst_DS__DS_MIN_RTN_U32

    Inst_DS__DS_MIN_RTN_U32::~Inst_DS__DS_MIN_RTN_U32()
    {
    } // ~Inst_DS__DS_MIN_RTN_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_MIN_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_RTN_U32 class methods ---

    Inst_DS__DS_MAX_RTN_U32::Inst_DS__DS_MAX_RTN_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_u32")
    {
    } // Inst_DS__DS_MAX_RTN_U32

    Inst_DS__DS_MAX_RTN_U32::~Inst_DS__DS_MAX_RTN_U32()
    {
    } // ~Inst_DS__DS_MAX_RTN_U32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_MAX_RTN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_AND_RTN_B32 class methods ---

    Inst_DS__DS_AND_RTN_B32::Inst_DS__DS_AND_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_rtn_b32")
    {
    } // Inst_DS__DS_AND_RTN_B32

    Inst_DS__DS_AND_RTN_B32::~Inst_DS__DS_AND_RTN_B32()
    {
    } // ~Inst_DS__DS_AND_RTN_B32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA;
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_AND_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_OR_RTN_B32 class methods ---

    Inst_DS__DS_OR_RTN_B32::Inst_DS__DS_OR_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_rtn_b32")
    {
    } // Inst_DS__DS_OR_RTN_B32

    Inst_DS__DS_OR_RTN_B32::~Inst_DS__DS_OR_RTN_B32()
    {
    } // ~Inst_DS__DS_OR_RTN_B32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA;
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_OR_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_XOR_RTN_B32 class methods ---

    Inst_DS__DS_XOR_RTN_B32::Inst_DS__DS_XOR_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_rtn_b32")
    {
    } // Inst_DS__DS_XOR_RTN_B32

    Inst_DS__DS_XOR_RTN_B32::~Inst_DS__DS_XOR_RTN_B32()
    {
    } // ~Inst_DS__DS_XOR_RTN_B32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA;
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_XOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MSKOR_RTN_B32 class methods ---

    Inst_DS__DS_MSKOR_RTN_B32::Inst_DS__DS_MSKOR_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_rtn_b32")
    {
    } // Inst_DS__DS_MSKOR_RTN_B32

    Inst_DS__DS_MSKOR_RTN_B32::~Inst_DS__DS_MSKOR_RTN_B32()
    {
    } // ~Inst_DS__DS_MSKOR_RTN_B32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    // Masked dword OR, D0 contains the mask and D1 contains the new value.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_MSKOR_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRXCHG_RTN_B32 class methods ---

    Inst_DS__DS_WRXCHG_RTN_B32::Inst_DS__DS_WRXCHG_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg_rtn_b32")
    {
    } // Inst_DS__DS_WRXCHG_RTN_B32

    Inst_DS__DS_WRXCHG_RTN_B32::~Inst_DS__DS_WRXCHG_RTN_B32()
    {
    } // ~Inst_DS__DS_WRXCHG_RTN_B32

    // --- description from .arch file ---
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
    // Write-exchange operation.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_WRXCHG_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRXCHG2_RTN_B32 class methods ---

    Inst_DS__DS_WRXCHG2_RTN_B32::Inst_DS__DS_WRXCHG2_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2_rtn_b32")
    {
    } // Inst_DS__DS_WRXCHG2_RTN_B32

    Inst_DS__DS_WRXCHG2_RTN_B32::~Inst_DS__DS_WRXCHG2_RTN_B32()
    {
    } // ~Inst_DS__DS_WRXCHG2_RTN_B32

    // --- description from .arch file ---
    // Write-exchange 2 separate dwords.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_WRXCHG2_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRXCHG2ST64_RTN_B32 class methods ---

    Inst_DS__DS_WRXCHG2ST64_RTN_B32::Inst_DS__DS_WRXCHG2ST64_RTN_B32(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b32")
    {
    } // Inst_DS__DS_WRXCHG2ST64_RTN_B32

    Inst_DS__DS_WRXCHG2ST64_RTN_B32::~Inst_DS__DS_WRXCHG2ST64_RTN_B32()
    {
    } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B32

    // --- description from .arch file ---
    // Write-exchange 2 separate dwords with a stride of 64 dwords.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_WRXCHG2ST64_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_CMPST_RTN_B32 class methods ---

    Inst_DS__DS_CMPST_RTN_B32::Inst_DS__DS_CMPST_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_b32")
    {
    } // Inst_DS__DS_CMPST_RTN_B32

    Inst_DS__DS_CMPST_RTN_B32::~Inst_DS__DS_CMPST_RTN_B32()
    {
    } // ~Inst_DS__DS_CMPST_RTN_B32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Compare and store.
    // Caution, the order of src and cmp are the *opposite* of the
    // --- BUFFER_ATOMIC_CMPSWAP opcode.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_CMPST_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_CMPST_RTN_F32 class methods ---

    Inst_DS__DS_CMPST_RTN_F32::Inst_DS__DS_CMPST_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_CMPST_RTN_F32

    Inst_DS__DS_CMPST_RTN_F32::~Inst_DS__DS_CMPST_RTN_F32()
    {
    } // ~Inst_DS__DS_CMPST_RTN_F32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Floating point compare and store that handles NaN/INF/denormal values.
    // Caution, the order of src and cmp are the *opposite* of the
    // --- BUFFER_ATOMIC_FCMPSWAP opcode.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_CMPST_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_RTN_F32 class methods ---

    Inst_DS__DS_MIN_RTN_F32::Inst_DS__DS_MIN_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MIN_RTN_F32

    Inst_DS__DS_MIN_RTN_F32::~Inst_DS__DS_MIN_RTN_F32()
    {
    } // ~Inst_DS__DS_MIN_RTN_F32

    // --- description from .arch file ---
    // 32b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    // Floating point minimum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMIN.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_MIN_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_RTN_F32 class methods ---

    Inst_DS__DS_MAX_RTN_F32::Inst_DS__DS_MAX_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MAX_RTN_F32

    Inst_DS__DS_MAX_RTN_F32::~Inst_DS__DS_MAX_RTN_F32()
    {
    } // ~Inst_DS__DS_MAX_RTN_F32

    // --- description from .arch file ---
    // 32b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    // Floating point maximum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMAX.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_MAX_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRAP_RTN_B32 class methods ---

    Inst_DS__DS_WRAP_RTN_B32::Inst_DS__DS_WRAP_RTN_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrap_rtn_b32")
    {
    } // Inst_DS__DS_WRAP_RTN_B32

    Inst_DS__DS_WRAP_RTN_B32::~Inst_DS__DS_WRAP_RTN_B32()
    {
    } // ~Inst_DS__DS_WRAP_RTN_B32

    // --- description from .arch file ---
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? tmp - DATA : tmp + DATA2;
    // RETURN_DATA = tmp.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_WRAP_RTN_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_ADD_RTN_F32 class methods ---

    Inst_DS__DS_ADD_RTN_F32::Inst_DS__DS_ADD_RTN_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_rtn_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_ADD_RTN_F32

    Inst_DS__DS_ADD_RTN_F32::~Inst_DS__DS_ADD_RTN_F32()
    {
    } // ~Inst_DS__DS_ADD_RTN_F32

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA;
    // RETURN_DATA = tmp.
    // Floating point add that handles NaN/INF/denormal values.
    // Not implemented in the model; executing this opcode panics.
    void
    Inst_DS__DS_ADD_RTN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_READ_B32 class methods ---

    Inst_DS__DS_READ_B32::Inst_DS__DS_READ_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_read_b32")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_DS__DS_READ_B32

    Inst_DS__DS_READ_B32::~Inst_DS__DS_READ_B32()
    {
    } // ~Inst_DS__DS_READ_B32

    // --- description from .arch file ---
    // RETURN_DATA = MEM[ADDR].
    // Dword read.
- void - Inst_DS__DS_READ_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ2_B32 class methods --- - - Inst_DS__DS_READ2_B32::Inst_DS__DS_READ2_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2_B32 - - Inst_DS__DS_READ2_B32::~Inst_DS__DS_READ2_B32() - { - } // ~Inst_DS__DS_READ2_B32 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4]. - // Read 2 dwords. 
- void - Inst_DS__DS_READ2_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 4; - Addr offset1 = instData.OFFSET1 * 4; - - initDualMemRead(gpuDynInst, offset0, offset1); - } // initiateAcc - - void - Inst_DS__DS_READ2_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - // --- Inst_DS__DS_READ2ST64_B32 class methods --- - - Inst_DS__DS_READ2ST64_B32::Inst_DS__DS_READ2ST64_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2st64_b32") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2ST64_B32 - - Inst_DS__DS_READ2ST64_B32::~Inst_DS__DS_READ2ST64_B32() - { - } // ~Inst_DS__DS_READ2ST64_B32 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 4 * 64]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 4 * 64]. - // Read 2 dwords. 
- void - Inst_DS__DS_READ2ST64_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2ST64_B32::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = (instData.OFFSET0 * 4 * 64); - Addr offset1 = (instData.OFFSET1 * 4 * 64); - - initDualMemRead(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_READ2ST64_B32::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } - // --- Inst_DS__DS_READ_I8 class methods --- - - Inst_DS__DS_READ_I8::Inst_DS__DS_READ_I8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_i8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_I8 - - Inst_DS__DS_READ_I8::~Inst_DS__DS_READ_I8() - { - } // ~Inst_DS__DS_READ_I8 - - // --- description from .arch file --- - // RETURN_DATA = signext(MEM[ADDR][7:0]). - // Signed byte read. 
- void - Inst_DS__DS_READ_I8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_I8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_I8::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)sext<8>((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ_U8 class methods --- - - Inst_DS__DS_READ_U8::Inst_DS__DS_READ_U8(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_u8") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_U8 - - Inst_DS__DS_READ_U8::~Inst_DS__DS_READ_U8() - { - } // ~Inst_DS__DS_READ_U8 - - // --- description from .arch file --- - // RETURN_DATA = {24'h0,MEM[ADDR][7:0]}. - // Unsigned byte read. 
- void - Inst_DS__DS_READ_U8::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_U8::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_U8::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)(reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ_I16 class methods --- - - Inst_DS__DS_READ_I16::Inst_DS__DS_READ_I16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_i16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_I16 - - Inst_DS__DS_READ_I16::~Inst_DS__DS_READ_I16() - { - } // ~Inst_DS__DS_READ_I16 - - // --- description from .arch file --- - // RETURN_DATA = signext(MEM[ADDR][15:0]). - // Signed short read. 
- void - Inst_DS__DS_READ_I16::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_READ_U16 class methods --- - - Inst_DS__DS_READ_U16::Inst_DS__DS_READ_U16(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_u16") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_U16 - - Inst_DS__DS_READ_U16::~Inst_DS__DS_READ_U16() - { - } // ~Inst_DS__DS_READ_U16 - - // --- description from .arch file --- - // RETURN_DATA = {16'h0,MEM[ADDR][15:0]}. - // Unsigned short read. - void - Inst_DS__DS_READ_U16::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - void - Inst_DS__DS_READ_U16::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_U16::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)(reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_SWIZZLE_B32 class methods --- - - Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_swizzle_b32") - { - /** - * While this operation doesn't actually use DS storage we classify - * it as a load here because it does a writeback to a VGPR, which - * fits in better with the 
LDS pipeline logic. - */ - setFlag(Load); - } // Inst_DS__DS_SWIZZLE_B32 - - Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32() - { - } // ~Inst_DS__DS_SWIZZLE_B32 - - // --- description from .arch file --- - // RETURN_DATA = swizzle(vgpr_data, offset1:offset0). - // Dword swizzle, no data is written to LDS memory; See ds_opcodes.docx for - // --- details. - void - Inst_DS__DS_SWIZZLE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - wf->decLGKMInstsIssued(); - - if (gpuDynInst->exec_mask.none()) { - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - /** - * The "DS pattern" is comprised of both offset fields. That is, the - * swizzle pattern between lanes. Bit 15 of the DS pattern dictates - * which swizzle mode to use. There are two different swizzle - * patterns: 1) QDMode and 2) Bit-masks mode. If bit 15 is set use - * QDMode else use Bit-masks mode. The remaining bits dictate how to - * swizzle the lanes. - * - * QDMode: Chunks the lanes into 4s and swizzles among them. - * Bits 7:6 dictate where lane 3 (of the current chunk) - * gets its date, 5:4 lane 2, etc. - * - * Bit-mask: This mode breaks bits 14:0 into 3 equal-sized chunks. - * 14:10 is the xor_mask, 9:5 is the or_mask, and 4:0 - * is the and_mask. Each lane is swizzled by performing - * the appropriate operation using these masks. - */ - VecElemU16 ds_pattern = ((instData.OFFSET1 << 8) | instData.OFFSET0); - - data.read(); - - if (bits(ds_pattern, 15)) { - // QDMode - for (int lane = 0; lane < NumVecElemPerVecReg; lane += 4) { - /** - * This operation allows data sharing between groups - * of four consecutive threads. Note the increment by - * 4 in the for loop. 
- */ - if (gpuDynInst->exec_mask[lane]) { - int index0 = lane + bits(ds_pattern, 1, 0); - panic_if(index0 >= NumVecElemPerVecReg, "%s: index0 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index0); - vdst[lane] - = gpuDynInst->exec_mask[index0] ? data[index0]: 0; - } - if (gpuDynInst->exec_mask[lane + 1]) { - int index1 = lane + bits(ds_pattern, 3, 2); - panic_if(index1 >= NumVecElemPerVecReg, "%s: index1 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index1); - vdst[lane + 1] - = gpuDynInst->exec_mask[index1] ? data[index1]: 0; - } - if (gpuDynInst->exec_mask[lane + 2]) { - int index2 = lane + bits(ds_pattern, 5, 4); - panic_if(index2 >= NumVecElemPerVecReg, "%s: index2 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index2); - vdst[lane + 2] - = gpuDynInst->exec_mask[index2] ? data[index2]: 0; - } - if (gpuDynInst->exec_mask[lane + 3]) { - int index3 = lane + bits(ds_pattern, 7, 6); - panic_if(index3 >= NumVecElemPerVecReg, "%s: index3 (%d) " - "is out of bounds.\n", gpuDynInst->disassemble(), - index3); - vdst[lane + 3] - = gpuDynInst->exec_mask[index3] ? data[index3]: 0; - } - } - } else { - // Bit Mode - int and_mask = bits(ds_pattern, 4, 0); - int or_mask = bits(ds_pattern, 9, 5); - int xor_mask = bits(ds_pattern, 14, 10); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - int index = (((lane & and_mask) | or_mask) ^ xor_mask); - // Adjust for the next 32 lanes. - if (lane > 31) { - index += 32; - } - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is " - "out of bounds.\n", gpuDynInst->disassemble(), - index); - vdst[lane] - = gpuDynInst->exec_mask[index] ? data[index] : 0; - } - } - } - - vdst.write(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. 
- * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - /** - * Similarly, this counter could build up over time, even across - * multiple wavefronts, and cause a deadlock. - */ - wf->rdLmReqsInPipe--; - } // execute - // --- Inst_DS__DS_PERMUTE_B32 class methods --- - - Inst_DS__DS_PERMUTE_B32::Inst_DS__DS_PERMUTE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_permute_b32") - { - setFlag(MemoryRef); - /** - * While this operation doesn't actually use DS storage we classify - * it as a load here because it does a writeback to a VGPR, which - * fits in better with the LDS pipeline logic. - */ - setFlag(Load); - } // Inst_DS__DS_PERMUTE_B32 - - Inst_DS__DS_PERMUTE_B32::~Inst_DS__DS_PERMUTE_B32() - { - } // ~Inst_DS__DS_PERMUTE_B32 - - // --- description from .arch file --- - // Forward permute. - void - Inst_DS__DS_PERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - wf->decLGKMInstsIssued(); - - if (gpuDynInst->exec_mask.none()) { - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - addr.read(); - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - /** - * One of the offset fields can be used for the index. - * It is assumed OFFSET0 would be used, as OFFSET1 is - * typically only used for DS ops that operate on two - * disparate pieces of data. - */ - assert(!instData.OFFSET1); - /** - * The address provided is a byte address, but VGPRs are - * 4 bytes, so we must divide by 4 to get the actual VGPR - * index. 
Additionally, the index is calculated modulo the - * WF size, 64 in this case, so we simply extract bits 7-2. - */ - int index = bits(addr[lane] + instData.OFFSET0, 7, 2); - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " - "of bounds.\n", gpuDynInst->disassemble(), index); - /** - * If the shuffled index corresponds to a lane that is - * inactive then this instruction writes a 0 to the active - * lane in VDST. - */ - if (wf->execMask(index)) { - vdst[index] = data[lane]; - } else { - vdst[index] = 0; - } - } - } - - vdst.write(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. - * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - /** - * Similarly, this counter could build up over time, even across - * multiple wavefronts, and cause a deadlock. - */ - wf->rdLmReqsInPipe--; - } // execute - // --- Inst_DS__DS_BPERMUTE_B32 class methods --- - - Inst_DS__DS_BPERMUTE_B32::Inst_DS__DS_BPERMUTE_B32(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_bpermute_b32") - { - setFlag(MemoryRef); - /** - * While this operation doesn't actually use DS storage we classify - * it as a load here because it does a writeback to a VGPR, which - * fits in better with the LDS pipeline logic. - */ - setFlag(Load); - } // Inst_DS__DS_BPERMUTE_B32 - - Inst_DS__DS_BPERMUTE_B32::~Inst_DS__DS_BPERMUTE_B32() - { - } // ~Inst_DS__DS_BPERMUTE_B32 - - // --- description from .arch file --- - // Backward permute. 
- void - Inst_DS__DS_BPERMUTE_B32::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - wf->decLGKMInstsIssued(); - - if (gpuDynInst->exec_mask.none()) { - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit() - ->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data(gpuDynInst, extData.DATA0); - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - addr.read(); - data.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - /** - * One of the offset fields can be used for the index. - * It is assumed OFFSET0 would be used, as OFFSET1 is - * typically only used for DS ops that operate on two - * disparate pieces of data. - */ - assert(!instData.OFFSET1); - /** - * The address provided is a byte address, but VGPRs are - * 4 bytes, so we must divide by 4 to get the actual VGPR - * index. Additionally, the index is calculated modulo the - * WF size, 64 in this case, so we simply extract bits 7-2. - */ - int index = bits(addr[lane] + instData.OFFSET0, 7, 2); - panic_if(index >= NumVecElemPerVecReg, "%s: index (%d) is out " - "of bounds.\n", gpuDynInst->disassemble(), index); - /** - * If the shuffled index corresponds to a lane that is - * inactive then this instruction writes a 0 to the active - * lane in VDST. - */ - if (wf->execMask(index)) { - vdst[lane] = data[index]; - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - - /** - * This is needed because we treat this instruction as a load - * but it's not an actual memory request. 
- * Without this, the destination register never gets marked as - * free, leading to a possible deadlock - */ - wf->computeUnit->vrf[wf->simdId]-> - scheduleWriteOperandsFromLoad(wf, gpuDynInst); - /** - * Similarly, this counter could build up over time, even across - * multiple wavefronts, and cause a deadlock. - */ - wf->rdLmReqsInPipe--; - } // execute - - // --- Inst_DS__DS_ADD_U64 class methods --- - - Inst_DS__DS_ADD_U64::Inst_DS__DS_ADD_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_u64") - { - setFlag(MemoryRef); - setFlag(GroupSegment); - setFlag(AtomicAdd); - setFlag(AtomicNoReturn); - } // Inst_DS__DS_ADD_U64 - - Inst_DS__DS_ADD_U64::~Inst_DS__DS_ADD_U64() - { - } // ~Inst_DS__DS_ADD_U64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR] += DATA[0:1]; - void - Inst_DS__DS_ADD_U64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_ADD_U64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initAtomicAccess(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_ADD_U64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_SUB_U64 class methods --- - - 
Inst_DS__DS_SUB_U64::Inst_DS__DS_SUB_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_u64") - { - } // Inst_DS__DS_SUB_U64 - - Inst_DS__DS_SUB_U64::~Inst_DS__DS_SUB_U64() - { - } // ~Inst_DS__DS_SUB_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_SUB_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_U64 class methods --- - - Inst_DS__DS_RSUB_U64::Inst_DS__DS_RSUB_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_u64") - { - } // Inst_DS__DS_RSUB_U64 - - Inst_DS__DS_RSUB_U64::~Inst_DS__DS_RSUB_U64() - { - } // ~Inst_DS__DS_RSUB_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. - void - Inst_DS__DS_RSUB_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_INC_U64 class methods --- - - Inst_DS__DS_INC_U64::Inst_DS__DS_INC_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_inc_u64") - { - } // Inst_DS__DS_INC_U64 - - Inst_DS__DS_INC_U64::~Inst_DS__DS_INC_U64() - { - } // ~Inst_DS__DS_INC_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_INC_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_DEC_U64 class methods --- - - Inst_DS__DS_DEC_U64::Inst_DS__DS_DEC_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_dec_u64") - { - } // Inst_DS__DS_DEC_U64 - - Inst_DS__DS_DEC_U64::~Inst_DS__DS_DEC_U64() - { - } // ~Inst_DS__DS_DEC_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_DEC_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_I64 class methods --- - - Inst_DS__DS_MIN_I64::Inst_DS__DS_MIN_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_i64") - { - } // Inst_DS__DS_MIN_I64 - - Inst_DS__DS_MIN_I64::~Inst_DS__DS_MIN_I64() - { - } // ~Inst_DS__DS_MIN_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MIN_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_I64 class methods --- - - Inst_DS__DS_MAX_I64::Inst_DS__DS_MAX_I64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_i64") - { - } // Inst_DS__DS_MAX_I64 - - Inst_DS__DS_MAX_I64::~Inst_DS__DS_MAX_I64() - { - } // ~Inst_DS__DS_MAX_I64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_I64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_U64 class methods --- - - Inst_DS__DS_MIN_U64::Inst_DS__DS_MIN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_u64") - { - } // Inst_DS__DS_MIN_U64 - - Inst_DS__DS_MIN_U64::~Inst_DS__DS_MIN_U64() - { - } // ~Inst_DS__DS_MIN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_MIN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_U64 class methods --- - - Inst_DS__DS_MAX_U64::Inst_DS__DS_MAX_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_u64") - { - } // Inst_DS__DS_MAX_U64 - - Inst_DS__DS_MAX_U64::~Inst_DS__DS_MAX_U64() - { - } // ~Inst_DS__DS_MAX_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_MAX_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_B64 class methods --- - - Inst_DS__DS_AND_B64::Inst_DS__DS_AND_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_b64") - { - } // Inst_DS__DS_AND_B64 - - Inst_DS__DS_AND_B64::~Inst_DS__DS_AND_B64() - { - } // ~Inst_DS__DS_AND_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_AND_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_B64 class methods --- - - Inst_DS__DS_OR_B64::Inst_DS__DS_OR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_b64") - { - } // Inst_DS__DS_OR_B64 - - Inst_DS__DS_OR_B64::~Inst_DS__DS_OR_B64() - { - } // ~Inst_DS__DS_OR_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_OR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_B64 class methods --- - - Inst_DS__DS_XOR_B64::Inst_DS__DS_XOR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_b64") - { - } // Inst_DS__DS_XOR_B64 - - Inst_DS__DS_XOR_B64::~Inst_DS__DS_XOR_B64() - { - } // ~Inst_DS__DS_XOR_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_DS__DS_XOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MSKOR_B64 class methods --- - - Inst_DS__DS_MSKOR_B64::Inst_DS__DS_MSKOR_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_mskor_b64") - { - } // Inst_DS__DS_MSKOR_B64 - - Inst_DS__DS_MSKOR_B64::~Inst_DS__DS_MSKOR_B64() - { - } // ~Inst_DS__DS_MSKOR_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2; - // RETURN_DATA = tmp. - // Masked dword OR, D0 contains the mask and D1 contains the new value. - void - Inst_DS__DS_MSKOR_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_B64 class methods --- - - Inst_DS__DS_WRITE_B64::Inst_DS__DS_WRITE_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B64 - - Inst_DS__DS_WRITE_B64::~Inst_DS__DS_WRITE_B64() - { - } // ~Inst_DS__DS_WRITE_B64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR] = DATA. - // Write qword. 
- void - Inst_DS__DS_WRITE_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data(gpuDynInst, extData.DATA0); - - addr.read(); - data.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE2_B64 class methods --- - - Inst_DS__DS_WRITE2_B64::Inst_DS__DS_WRITE2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2_B64 - - Inst_DS__DS_WRITE2_B64::~Inst_DS__DS_WRITE2_B64() - { - } // ~Inst_DS__DS_WRITE2_B64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR_BASE + OFFSET0 * 8] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 8] = DATA2. - // Write 2 qwords. 
- void - Inst_DS__DS_WRITE2_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8; - Addr offset1 = instData.OFFSET1 * 8; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_WRITE2ST64_B64 class methods --- - - Inst_DS__DS_WRITE2ST64_B64::Inst_DS__DS_WRITE2ST64_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write2st64_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE2ST64_B64 - - Inst_DS__DS_WRITE2ST64_B64::~Inst_DS__DS_WRITE2ST64_B64() - { - } // ~Inst_DS__DS_WRITE2ST64_B64 - - // --- description from .arch file --- - // 64b: - // MEM[ADDR_BASE + OFFSET0 * 8 * 64] = DATA; - // MEM[ADDR_BASE + OFFSET1 * 8 * 64] = DATA2; - // Write 2 qwords. 
- void - Inst_DS__DS_WRITE2ST64_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU64 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU64 data1(gpuDynInst, extData.DATA1); - - addr.read(); - data0.read(); - data1.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1] = data1[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8 * 64; - Addr offset1 = instData.OFFSET1 * 8 * 64; - - initDualMemWrite(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_WRITE2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - } - // --- Inst_DS__DS_CMPST_B64 class methods --- - - Inst_DS__DS_CMPST_B64::Inst_DS__DS_CMPST_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_b64") - { - } // Inst_DS__DS_CMPST_B64 - - Inst_DS__DS_CMPST_B64::~Inst_DS__DS_CMPST_B64() - { - } // ~Inst_DS__DS_CMPST_B64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Compare and store. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode. 
- void - Inst_DS__DS_CMPST_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_CMPST_F64 class methods --- - - Inst_DS__DS_CMPST_F64::Inst_DS__DS_CMPST_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_cmpst_f64") - { - setFlag(F64); - } // Inst_DS__DS_CMPST_F64 - - Inst_DS__DS_CMPST_F64::~Inst_DS__DS_CMPST_F64() - { - } // ~Inst_DS__DS_CMPST_F64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA2; - // cmp = DATA; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. - // Floating point compare and store that handles NaN/INF/denormal values. - // Caution, the order of src and cmp are the *opposite* of the - // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode. - void - Inst_DS__DS_CMPST_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_F64 class methods --- - - Inst_DS__DS_MIN_F64::Inst_DS__DS_MIN_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_F64 - - Inst_DS__DS_MIN_F64::~Inst_DS__DS_MIN_F64() - { - } // ~Inst_DS__DS_MIN_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (cmp < tmp) ? src : tmp. - // Floating point minimum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMIN_X2. - void - Inst_DS__DS_MIN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_F64 class methods --- - - Inst_DS__DS_MAX_F64::Inst_DS__DS_MAX_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_F64 - - Inst_DS__DS_MAX_F64::~Inst_DS__DS_MAX_F64() - { - } // ~Inst_DS__DS_MAX_F64 - - // --- description from .arch file --- - // 64b. - // tmp = MEM[ADDR]; - // src = DATA; - // cmp = DATA2; - // MEM[ADDR] = (tmp > cmp) ? src : tmp. 
- // Floating point maximum that handles NaN/INF/denormal values. - // Note that this opcode is slightly more general-purpose than - // --- BUFFER_ATOMIC_FMAX_X2. - void - Inst_DS__DS_MAX_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_ADD_RTN_U64 class methods --- - - Inst_DS__DS_ADD_RTN_U64::Inst_DS__DS_ADD_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_add_rtn_u64") - { - } // Inst_DS__DS_ADD_RTN_U64 - - Inst_DS__DS_ADD_RTN_U64::~Inst_DS__DS_ADD_RTN_U64() - { - } // ~Inst_DS__DS_ADD_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_ADD_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_SUB_RTN_U64 class methods --- - - Inst_DS__DS_SUB_RTN_U64::Inst_DS__DS_SUB_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_sub_rtn_u64") - { - } // Inst_DS__DS_SUB_RTN_U64 - - Inst_DS__DS_SUB_RTN_U64::~Inst_DS__DS_SUB_RTN_U64() - { - } // ~Inst_DS__DS_SUB_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_DS__DS_SUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_RSUB_RTN_U64 class methods --- - - Inst_DS__DS_RSUB_RTN_U64::Inst_DS__DS_RSUB_RTN_U64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_rsub_rtn_u64") - { - } // Inst_DS__DS_RSUB_RTN_U64 - - Inst_DS__DS_RSUB_RTN_U64::~Inst_DS__DS_RSUB_RTN_U64() - { - } // ~Inst_DS__DS_RSUB_RTN_U64 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA - MEM[ADDR]; - // RETURN_DATA = tmp. - // Subtraction with reversed operands. 
    void
    Inst_DS__DS_RSUB_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_INC_RTN_U64 class methods ---

    Inst_DS__DS_INC_RTN_U64::Inst_DS__DS_INC_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_rtn_u64")
    {
    } // Inst_DS__DS_INC_RTN_U64

    Inst_DS__DS_INC_RTN_U64::~Inst_DS__DS_INC_RTN_U64()
    {
    } // ~Inst_DS__DS_INC_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_INC_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_DEC_RTN_U64 class methods ---

    Inst_DS__DS_DEC_RTN_U64::Inst_DS__DS_DEC_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_rtn_u64")
    {
    } // Inst_DS__DS_DEC_RTN_U64

    Inst_DS__DS_DEC_RTN_U64::~Inst_DS__DS_DEC_RTN_U64()
    {
    } // ~Inst_DS__DS_DEC_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1
    // (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_DEC_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_RTN_I64 class methods ---

    Inst_DS__DS_MIN_RTN_I64::Inst_DS__DS_MIN_RTN_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_i64")
    {
    } // Inst_DS__DS_MIN_RTN_I64

    Inst_DS__DS_MIN_RTN_I64::~Inst_DS__DS_MIN_RTN_I64()
    {
    } // ~Inst_DS__DS_MIN_RTN_I64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MIN_RTN_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_RTN_I64 class methods ---

    Inst_DS__DS_MAX_RTN_I64::Inst_DS__DS_MAX_RTN_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_i64")
    {
    } // Inst_DS__DS_MAX_RTN_I64

    Inst_DS__DS_MAX_RTN_I64::~Inst_DS__DS_MAX_RTN_I64()
    {
    } // ~Inst_DS__DS_MAX_RTN_I64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MAX_RTN_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_RTN_U64 class methods ---

    Inst_DS__DS_MIN_RTN_U64::Inst_DS__DS_MIN_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_u64")
    {
    } // Inst_DS__DS_MIN_RTN_U64

    Inst_DS__DS_MIN_RTN_U64::~Inst_DS__DS_MIN_RTN_U64()
    {
    } // ~Inst_DS__DS_MIN_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MIN_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_RTN_U64 class methods ---

    Inst_DS__DS_MAX_RTN_U64::Inst_DS__DS_MAX_RTN_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_u64")
    {
    } // Inst_DS__DS_MAX_RTN_U64

    Inst_DS__DS_MAX_RTN_U64::~Inst_DS__DS_MAX_RTN_U64()
    {
    } // ~Inst_DS__DS_MAX_RTN_U64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare);
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_MAX_RTN_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_AND_RTN_B64 class methods ---

    Inst_DS__DS_AND_RTN_B64::Inst_DS__DS_AND_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_rtn_b64")
    {
    } // Inst_DS__DS_AND_RTN_B64

    Inst_DS__DS_AND_RTN_B64::~Inst_DS__DS_AND_RTN_B64()
    {
    } // ~Inst_DS__DS_AND_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_AND_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_OR_RTN_B64 class methods ---

    Inst_DS__DS_OR_RTN_B64::Inst_DS__DS_OR_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_rtn_b64")
    {
    } // Inst_DS__DS_OR_RTN_B64

    Inst_DS__DS_OR_RTN_B64::~Inst_DS__DS_OR_RTN_B64()
    {
    } // ~Inst_DS__DS_OR_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_OR_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_XOR_RTN_B64 class methods ---

    Inst_DS__DS_XOR_RTN_B64::Inst_DS__DS_XOR_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_rtn_b64")
    {
    } // Inst_DS__DS_XOR_RTN_B64

    Inst_DS__DS_XOR_RTN_B64::~Inst_DS__DS_XOR_RTN_B64()
    {
    } // ~Inst_DS__DS_XOR_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_DS__DS_XOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MSKOR_RTN_B64 class methods ---

    Inst_DS__DS_MSKOR_RTN_B64::Inst_DS__DS_MSKOR_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_mskor_rtn_b64")
    {
    } // Inst_DS__DS_MSKOR_RTN_B64

    Inst_DS__DS_MSKOR_RTN_B64::~Inst_DS__DS_MSKOR_RTN_B64()
    {
    } // ~Inst_DS__DS_MSKOR_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (MEM_ADDR[ADDR] & ~DATA) | DATA2;
    // RETURN_DATA = tmp.
    // Masked dword OR, D0 contains the mask and D1 contains the new value.
    void
    Inst_DS__DS_MSKOR_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRXCHG_RTN_B64 class methods ---

    Inst_DS__DS_WRXCHG_RTN_B64::Inst_DS__DS_WRXCHG_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg_rtn_b64")
    {
    } // Inst_DS__DS_WRXCHG_RTN_B64

    Inst_DS__DS_WRXCHG_RTN_B64::~Inst_DS__DS_WRXCHG_RTN_B64()
    {
    } // ~Inst_DS__DS_WRXCHG_RTN_B64

    // --- description from .arch file ---
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
    // Write-exchange operation.
    void
    Inst_DS__DS_WRXCHG_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRXCHG2_RTN_B64 class methods ---

    Inst_DS__DS_WRXCHG2_RTN_B64::Inst_DS__DS_WRXCHG2_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2_rtn_b64")
    {
    } // Inst_DS__DS_WRXCHG2_RTN_B64

    Inst_DS__DS_WRXCHG2_RTN_B64::~Inst_DS__DS_WRXCHG2_RTN_B64()
    {
    } // ~Inst_DS__DS_WRXCHG2_RTN_B64

    // --- description from .arch file ---
    // Write-exchange 2 separate qwords.
    void
    Inst_DS__DS_WRXCHG2_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRXCHG2ST64_RTN_B64 class methods ---

    Inst_DS__DS_WRXCHG2ST64_RTN_B64::Inst_DS__DS_WRXCHG2ST64_RTN_B64(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_wrxchg2st64_rtn_b64")
    {
    } // Inst_DS__DS_WRXCHG2ST64_RTN_B64

    Inst_DS__DS_WRXCHG2ST64_RTN_B64::~Inst_DS__DS_WRXCHG2ST64_RTN_B64()
    {
    } // ~Inst_DS__DS_WRXCHG2ST64_RTN_B64

    // --- description from .arch file ---
    // Write-exchange 2 qwords with a stride of 64 qwords.
    void
    Inst_DS__DS_WRXCHG2ST64_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_CMPST_RTN_B64 class methods ---

    Inst_DS__DS_CMPST_RTN_B64::Inst_DS__DS_CMPST_RTN_B64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_b64")
    {
    } // Inst_DS__DS_CMPST_RTN_B64

    Inst_DS__DS_CMPST_RTN_B64::~Inst_DS__DS_CMPST_RTN_B64()
    {
    } // ~Inst_DS__DS_CMPST_RTN_B64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Compare and store.
    // Caution, the order of src and cmp are the *opposite* of the
    // --- BUFFER_ATOMIC_CMPSWAP_X2 opcode.
    void
    Inst_DS__DS_CMPST_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_CMPST_RTN_F64 class methods ---

    Inst_DS__DS_CMPST_RTN_F64::Inst_DS__DS_CMPST_RTN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_cmpst_rtn_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_CMPST_RTN_F64

    Inst_DS__DS_CMPST_RTN_F64::~Inst_DS__DS_CMPST_RTN_F64()
    {
    } // ~Inst_DS__DS_CMPST_RTN_F64

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // src = DATA2;
    // cmp = DATA;
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    // Floating point compare and store that handles NaN/INF/denormal values.
    // Caution, the order of src and cmp are the *opposite* of the
    // --- BUFFER_ATOMIC_FCMPSWAP_X2 opcode.
    void
    Inst_DS__DS_CMPST_RTN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_RTN_F64 class methods ---

    Inst_DS__DS_MIN_RTN_F64::Inst_DS__DS_MIN_RTN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_rtn_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MIN_RTN_F64

    Inst_DS__DS_MIN_RTN_F64::~Inst_DS__DS_MIN_RTN_F64()
    {
    } // ~Inst_DS__DS_MIN_RTN_F64

    // --- description from .arch file ---
    // 64b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (cmp < tmp) ? src : tmp.
    // Floating point minimum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMIN_X2.
    void
    Inst_DS__DS_MIN_RTN_F64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_RTN_F64 class methods ---

    Inst_DS__DS_MAX_RTN_F64::Inst_DS__DS_MAX_RTN_F64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_rtn_f64")
    {
        setFlag(F64);
    } // Inst_DS__DS_MAX_RTN_F64

    Inst_DS__DS_MAX_RTN_F64::~Inst_DS__DS_MAX_RTN_F64()
    {
    } // ~Inst_DS__DS_MAX_RTN_F64

    // --- description from .arch file ---
    // 64b.
    // tmp = MEM[ADDR];
    // src = DATA;
    // cmp = DATA2;
    // MEM[ADDR] = (tmp > cmp) ? src : tmp.
    // Floating point maximum that handles NaN/INF/denormal values.
    // Note that this opcode is slightly more general-purpose than
    // --- BUFFER_ATOMIC_FMAX_X2.
- void - Inst_DS__DS_MAX_RTN_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_READ_B64 class methods --- - - Inst_DS__DS_READ_B64::Inst_DS__DS_READ_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B64 - - Inst_DS__DS_READ_B64::~Inst_DS__DS_READ_B64() - { - } // ~Inst_DS__DS_READ_B64 - - // --- description from .arch file --- - // RETURN_DATA = MEM[ADDR]. - // Read 1 qword. - void - Inst_DS__DS_READ_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } // completeAcc - // --- Inst_DS__DS_READ2_B64 class methods --- - - Inst_DS__DS_READ2_B64::Inst_DS__DS_READ2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2_B64 - - Inst_DS__DS_READ2_B64::~Inst_DS__DS_READ2_B64() - { - } // ~Inst_DS__DS_READ2_B64 - - // --- description from .arch file --- - // 
RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8]. - // Read 2 qwords. - void - Inst_DS__DS_READ2_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0 * 8; - Addr offset1 = instData.OFFSET1 * 8; - - initDualMemRead(gpuDynInst, offset0, offset1); - } // initiateAcc - - void - Inst_DS__DS_READ2_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst0(gpuDynInst, extData.VDST); - VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - // --- Inst_DS__DS_READ2ST64_B64 class methods --- - - Inst_DS__DS_READ2ST64_B64::Inst_DS__DS_READ2ST64_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read2st64_b64") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ2ST64_B64 - - Inst_DS__DS_READ2ST64_B64::~Inst_DS__DS_READ2ST64_B64() - { - } // ~Inst_DS__DS_READ2ST64_B64 - - // --- description from .arch file --- - // RETURN_DATA[0] = MEM[ADDR_BASE + OFFSET0 * 8 * 64]; - // RETURN_DATA[1] = MEM[ADDR_BASE + OFFSET1 * 8 * 64]. - // Read 2 qwords. 
- void - Inst_DS__DS_READ2ST64_B64::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ2ST64_B64::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = (instData.OFFSET0 * 8 * 64); - Addr offset1 = (instData.OFFSET1 * 8 * 64); - - initDualMemRead(gpuDynInst, offset0, offset1); - } - - void - Inst_DS__DS_READ2ST64_B64::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst0(gpuDynInst, extData.VDST); - VecOperandU64 vdst1(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } - } - - vdst0.write(); - vdst1.write(); - } - // --- Inst_DS__DS_CONDXCHG32_RTN_B64 class methods --- - - Inst_DS__DS_CONDXCHG32_RTN_B64::Inst_DS__DS_CONDXCHG32_RTN_B64( - InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_condxchg32_rtn_b64") - { - } // Inst_DS__DS_CONDXCHG32_RTN_B64 - - Inst_DS__DS_CONDXCHG32_RTN_B64::~Inst_DS__DS_CONDXCHG32_RTN_B64() - { - } // ~Inst_DS__DS_CONDXCHG32_RTN_B64 - - // --- description from .arch file --- - // Conditional write exchange. 
    void
    Inst_DS__DS_CONDXCHG32_RTN_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_ADD_SRC2_U32 class methods ---

    Inst_DS__DS_ADD_SRC2_U32::Inst_DS__DS_ADD_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_src2_u32")
    {
    } // Inst_DS__DS_ADD_SRC2_U32

    Inst_DS__DS_ADD_SRC2_U32::~Inst_DS__DS_ADD_SRC2_U32()
    {
    } // ~Inst_DS__DS_ADD_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] + MEM[B].
    void
    Inst_DS__DS_ADD_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_SUB_SRC2_U32 class methods ---

    Inst_DS__DS_SUB_SRC2_U32::Inst_DS__DS_SUB_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_src2_u32")
    {
    } // Inst_DS__DS_SUB_SRC2_U32

    Inst_DS__DS_SUB_SRC2_U32::~Inst_DS__DS_SUB_SRC2_U32()
    {
    } // ~Inst_DS__DS_SUB_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] - MEM[B].
    void
    Inst_DS__DS_SUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_RSUB_SRC2_U32 class methods ---

    Inst_DS__DS_RSUB_SRC2_U32::Inst_DS__DS_RSUB_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_src2_u32")
    {
    } // Inst_DS__DS_RSUB_SRC2_U32

    Inst_DS__DS_RSUB_SRC2_U32::~Inst_DS__DS_RSUB_SRC2_U32()
    {
    } // ~Inst_DS__DS_RSUB_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B] - MEM[A].
    void
    Inst_DS__DS_RSUB_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_INC_SRC2_U32 class methods ---

    Inst_DS__DS_INC_SRC2_U32::Inst_DS__DS_INC_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_src2_u32")
    {
    } // Inst_DS__DS_INC_SRC2_U32

    Inst_DS__DS_INC_SRC2_U32::~Inst_DS__DS_INC_SRC2_U32()
    {
    } // ~Inst_DS__DS_INC_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1).
    void
    Inst_DS__DS_INC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_DEC_SRC2_U32 class methods ---

    Inst_DS__DS_DEC_SRC2_U32::Inst_DS__DS_DEC_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_src2_u32")
    {
    } // Inst_DS__DS_DEC_SRC2_U32

    Inst_DS__DS_DEC_SRC2_U32::~Inst_DS__DS_DEC_SRC2_U32()
    {
    } // ~Inst_DS__DS_DEC_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1).
    // Uint decrement.
    void
    Inst_DS__DS_DEC_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_SRC2_I32 class methods ---

    Inst_DS__DS_MIN_SRC2_I32::Inst_DS__DS_MIN_SRC2_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_i32")
    {
    } // Inst_DS__DS_MIN_SRC2_I32

    Inst_DS__DS_MIN_SRC2_I32::~Inst_DS__DS_MIN_SRC2_I32()
    {
    } // ~Inst_DS__DS_MIN_SRC2_I32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MIN_SRC2_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_SRC2_I32 class methods ---

    Inst_DS__DS_MAX_SRC2_I32::Inst_DS__DS_MAX_SRC2_I32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_i32")
    {
    } // Inst_DS__DS_MAX_SRC2_I32

    Inst_DS__DS_MAX_SRC2_I32::~Inst_DS__DS_MAX_SRC2_I32()
    {
    } // ~Inst_DS__DS_MAX_SRC2_I32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MAX_SRC2_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_SRC2_U32 class methods ---

    Inst_DS__DS_MIN_SRC2_U32::Inst_DS__DS_MIN_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_u32")
    {
    } // Inst_DS__DS_MIN_SRC2_U32

    Inst_DS__DS_MIN_SRC2_U32::~Inst_DS__DS_MIN_SRC2_U32()
    {
    } // ~Inst_DS__DS_MIN_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MIN_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_SRC2_U32 class methods ---

    Inst_DS__DS_MAX_SRC2_U32::Inst_DS__DS_MAX_SRC2_U32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_u32")
    {
    } // Inst_DS__DS_MAX_SRC2_U32

    Inst_DS__DS_MAX_SRC2_U32::~Inst_DS__DS_MAX_SRC2_U32()
    {
    } // ~Inst_DS__DS_MAX_SRC2_U32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MAX_SRC2_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_AND_SRC2_B32 class methods ---

    Inst_DS__DS_AND_SRC2_B32::Inst_DS__DS_AND_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_and_src2_b32")
    {
    } // Inst_DS__DS_AND_SRC2_B32

    Inst_DS__DS_AND_SRC2_B32::~Inst_DS__DS_AND_SRC2_B32()
    {
    } // ~Inst_DS__DS_AND_SRC2_B32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] & MEM[B].
    void
    Inst_DS__DS_AND_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_OR_SRC2_B32 class methods ---

    Inst_DS__DS_OR_SRC2_B32::Inst_DS__DS_OR_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_or_src2_b32")
    {
    } // Inst_DS__DS_OR_SRC2_B32

    Inst_DS__DS_OR_SRC2_B32::~Inst_DS__DS_OR_SRC2_B32()
    {
    } // ~Inst_DS__DS_OR_SRC2_B32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] | MEM[B].
    void
    Inst_DS__DS_OR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_XOR_SRC2_B32 class methods ---

    Inst_DS__DS_XOR_SRC2_B32::Inst_DS__DS_XOR_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_xor_src2_b32")
    {
    } // Inst_DS__DS_XOR_SRC2_B32

    Inst_DS__DS_XOR_SRC2_B32::~Inst_DS__DS_XOR_SRC2_B32()
    {
    } // ~Inst_DS__DS_XOR_SRC2_B32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] ^ MEM[B].
    void
    Inst_DS__DS_XOR_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_WRITE_SRC2_B32 class methods ---

    Inst_DS__DS_WRITE_SRC2_B32::Inst_DS__DS_WRITE_SRC2_B32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_write_src2_b32")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_DS__DS_WRITE_SRC2_B32

    Inst_DS__DS_WRITE_SRC2_B32::~Inst_DS__DS_WRITE_SRC2_B32()
    {
    } // ~Inst_DS__DS_WRITE_SRC2_B32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B].
    // Write dword.
    void
    Inst_DS__DS_WRITE_SRC2_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_SRC2_F32 class methods ---

    Inst_DS__DS_MIN_SRC2_F32::Inst_DS__DS_MIN_SRC2_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MIN_SRC2_F32

    Inst_DS__DS_MIN_SRC2_F32::~Inst_DS__DS_MIN_SRC2_F32()
    {
    } // ~Inst_DS__DS_MIN_SRC2_F32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A].
    // Float, handles NaN/INF/denorm.
    void
    Inst_DS__DS_MIN_SRC2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_SRC2_F32 class methods ---

    Inst_DS__DS_MAX_SRC2_F32::Inst_DS__DS_MAX_SRC2_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_MAX_SRC2_F32

    Inst_DS__DS_MAX_SRC2_F32::~Inst_DS__DS_MAX_SRC2_F32()
    {
    } // ~Inst_DS__DS_MAX_SRC2_F32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A].
    // Float, handles NaN/INF/denorm.
    void
    Inst_DS__DS_MAX_SRC2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_ADD_SRC2_F32 class methods ---

    Inst_DS__DS_ADD_SRC2_F32::Inst_DS__DS_ADD_SRC2_F32(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_src2_f32")
    {
        setFlag(F32);
    } // Inst_DS__DS_ADD_SRC2_F32

    Inst_DS__DS_ADD_SRC2_F32::~Inst_DS__DS_ADD_SRC2_F32()
    {
    } // ~Inst_DS__DS_ADD_SRC2_F32

    // --- description from .arch file ---
    // 32b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B] + MEM[A].
    // Float, handles NaN/INF/denorm.
    void
    Inst_DS__DS_ADD_SRC2_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_GWS_SEMA_RELEASE_ALL class methods ---

    Inst_DS__DS_GWS_SEMA_RELEASE_ALL::Inst_DS__DS_GWS_SEMA_RELEASE_ALL(
          InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_release_all")
    {
    } // Inst_DS__DS_GWS_SEMA_RELEASE_ALL

    Inst_DS__DS_GWS_SEMA_RELEASE_ALL::~Inst_DS__DS_GWS_SEMA_RELEASE_ALL()
    {
    } // ~Inst_DS__DS_GWS_SEMA_RELEASE_ALL

    // --- description from .arch file ---
    // GDS Only: The GWS resource (rid) indicated will process this opcode by
    // updating the counter and labeling the specified resource as a semaphore.
    // //Determine the GWS resource to work on
    // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0];
    // //Incr the state counter of the resource
    // state.counter[rid] = state.wave_in_queue;
    // state.type = SEMAPHORE;
    // return rd_done; //release calling wave
    // This action will release ALL queued waves; it Will have no effect if no
    // --- waves are present.
    void
    Inst_DS__DS_GWS_SEMA_RELEASE_ALL::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_GWS_INIT class methods ---

    Inst_DS__DS_GWS_INIT::Inst_DS__DS_GWS_INIT(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_init")
    {
    } // Inst_DS__DS_GWS_INIT

    Inst_DS__DS_GWS_INIT::~Inst_DS__DS_GWS_INIT()
    {
    } // ~Inst_DS__DS_GWS_INIT

    // --- description from .arch file ---
    // GDS Only: Initialize a barrier or semaphore resource.
    // //Determine the GWS resource to work on
    // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0];
    // //Get the value to use in init
    // index = find_first_valid(vector mask)
    // value = DATA[thread: index]
    // //Set the state of the resource
    // state.counter[rid] = lsb(value); //limit #waves
    // state.flag[rid] = 0;
    // return rd_done; //release calling wave
    void
    Inst_DS__DS_GWS_INIT::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_GWS_SEMA_V class methods ---

    Inst_DS__DS_GWS_SEMA_V::Inst_DS__DS_GWS_SEMA_V(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_v")
    {
    } // Inst_DS__DS_GWS_SEMA_V

    Inst_DS__DS_GWS_SEMA_V::~Inst_DS__DS_GWS_SEMA_V()
    {
    } // ~Inst_DS__DS_GWS_SEMA_V

    // --- description from .arch file ---
    // GDS Only: The GWS resource indicated will process this opcode by
    // updating the counter and labeling the resource as a semaphore.
    // //Determine the GWS resource to work on
    // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0];
    // //Incr the state counter of the resource
    // state.counter[rid]++;
    // state.type = SEMAPHORE;
    // return rd_done; //release calling wave
    // This action will release one waved if any are queued in this resource.
    void
    Inst_DS__DS_GWS_SEMA_V::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_GWS_SEMA_BR class methods ---

    Inst_DS__DS_GWS_SEMA_BR::Inst_DS__DS_GWS_SEMA_BR(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_br")
    {
    } // Inst_DS__DS_GWS_SEMA_BR

    Inst_DS__DS_GWS_SEMA_BR::~Inst_DS__DS_GWS_SEMA_BR()
    {
    } // ~Inst_DS__DS_GWS_SEMA_BR

    // --- description from .arch file ---
    // GDS Only: The GWS resource indicated will process this opcode by
    // updating the counter by the bulk release delivered count and labeling
    // the resource as a semaphore.
    // //Determine the GWS resource to work on
    // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0];
    // index = find first valid (vector mask)
    // count = DATA[thread: index];
    // //Add count to the resource state counter
    // state.counter[rid] += count;
    // state.type = SEMAPHORE;
    // return rd_done; //release calling wave
    // This action will release count number of waves, immediately if queued,
    // or as they arrive from the noted resource.
    void
    Inst_DS__DS_GWS_SEMA_BR::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_GWS_SEMA_P class methods ---

    Inst_DS__DS_GWS_SEMA_P::Inst_DS__DS_GWS_SEMA_P(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_sema_p")
    {
    } // Inst_DS__DS_GWS_SEMA_P

    Inst_DS__DS_GWS_SEMA_P::~Inst_DS__DS_GWS_SEMA_P()
    {
    } // ~Inst_DS__DS_GWS_SEMA_P

    // --- description from .arch file ---
    // GDS Only: The GWS resource indicated will process this opcode by
    // queueing it until counter enables a release and then decrementing the
    // counter of the resource as a semaphore.
    // //Determine the GWS resource to work on
    // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + offset0[5:0];
    // state.type = SEMAPHORE;
    // ENQUEUE until(state[rid].counter > 0)
    // state[rid].counter--;
    // return rd_done
    void
    Inst_DS__DS_GWS_SEMA_P::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_GWS_BARRIER class methods ---

    Inst_DS__DS_GWS_BARRIER::Inst_DS__DS_GWS_BARRIER(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_gws_barrier")
    {
    } // Inst_DS__DS_GWS_BARRIER

    Inst_DS__DS_GWS_BARRIER::~Inst_DS__DS_GWS_BARRIER()
    {
    } // ~Inst_DS__DS_GWS_BARRIER

    // --- description from .arch file ---
    // GDS Only: The GWS resource indicated will process this opcode by
    // queueing it until barrier is satisfied. The number of waves needed is
    // passed in as DATA of first valid thread.
    // //Determine the GWS resource to work on
    // rid[5:0] = SH_SX_EXPCMD.gds_base[5:0] + OFFSET0[5:0];
    // index = find first valid (vector mask);
    // value = DATA[thread: index];
    // // Input Decision Machine
    // state.type[rid] = BARRIER;
    // if(state[rid].counter <= 0) {
    // thread[rid].flag = state[rid].flag;
    // ENQUEUE;
    // state[rid].flag = !state.flag;
    // state[rid].counter = value;
    // return rd_done;
    // } else {
    // state[rid].counter--;
    // thread.flag = state[rid].flag;
    // ENQUEUE;
    // }
    // Since the waves deliver the count for the next barrier, this function
    // can have a different size barrier for each occurrence.
    // // Release Machine
    // if(state.type == BARRIER) {
    // if(state.flag != thread.flag) {
    // return rd_done;
    // }
    // }
    void
    Inst_DS__DS_GWS_BARRIER::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_CONSUME class methods ---

    Inst_DS__DS_CONSUME::Inst_DS__DS_CONSUME(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_consume")
    {
    } // Inst_DS__DS_CONSUME

    Inst_DS__DS_CONSUME::~Inst_DS__DS_CONSUME()
    {
    } // ~Inst_DS__DS_CONSUME

    // --- description from .arch file ---
    // LDS & GDS. Subtract (count_bits(exec_mask)) from the value stored in DS
    // memory at (M0.base + instr_offset). Return the pre-operation value to
    // VGPRs.
    void
    Inst_DS__DS_CONSUME::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_APPEND class methods ---

    Inst_DS__DS_APPEND::Inst_DS__DS_APPEND(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_append")
    {
    } // Inst_DS__DS_APPEND

    Inst_DS__DS_APPEND::~Inst_DS__DS_APPEND()
    {
    } // ~Inst_DS__DS_APPEND

    // --- description from .arch file ---
    // LDS & GDS. Add (count_bits(exec_mask)) to the value stored in DS memory
    // at (M0.base + instr_offset). Return the pre-operation value to VGPRs.
    void
    Inst_DS__DS_APPEND::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_ORDERED_COUNT class methods ---

    Inst_DS__DS_ORDERED_COUNT::Inst_DS__DS_ORDERED_COUNT(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_ordered_count")
    {
    } // Inst_DS__DS_ORDERED_COUNT

    Inst_DS__DS_ORDERED_COUNT::~Inst_DS__DS_ORDERED_COUNT()
    {
    } // ~Inst_DS__DS_ORDERED_COUNT

    // --- description from .arch file ---
    // GDS-only. Add (count_bits(exec_mask)) to one of 4 dedicated
    // ordered-count counters (aka 'packers'). Additional bits of instr.offset
    // field are overloaded to hold packer-id, 'last'.
    void
    Inst_DS__DS_ORDERED_COUNT::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_ADD_SRC2_U64 class methods ---

    Inst_DS__DS_ADD_SRC2_U64::Inst_DS__DS_ADD_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_add_src2_u64")
    {
    } // Inst_DS__DS_ADD_SRC2_U64

    Inst_DS__DS_ADD_SRC2_U64::~Inst_DS__DS_ADD_SRC2_U64()
    {
    } // ~Inst_DS__DS_ADD_SRC2_U64

    // --- description from .arch file ---
    // 64b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] + MEM[B].
    void
    Inst_DS__DS_ADD_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_SUB_SRC2_U64 class methods ---

    Inst_DS__DS_SUB_SRC2_U64::Inst_DS__DS_SUB_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_sub_src2_u64")
    {
    } // Inst_DS__DS_SUB_SRC2_U64

    Inst_DS__DS_SUB_SRC2_U64::~Inst_DS__DS_SUB_SRC2_U64()
    {
    } // ~Inst_DS__DS_SUB_SRC2_U64

    // --- description from .arch file ---
    // 64b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[A] - MEM[B].
    void
    Inst_DS__DS_SUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_RSUB_SRC2_U64 class methods ---

    Inst_DS__DS_RSUB_SRC2_U64::Inst_DS__DS_RSUB_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_rsub_src2_u64")
    {
    } // Inst_DS__DS_RSUB_SRC2_U64

    Inst_DS__DS_RSUB_SRC2_U64::~Inst_DS__DS_RSUB_SRC2_U64()
    {
    } // ~Inst_DS__DS_RSUB_SRC2_U64

    // --- description from .arch file ---
    // 64b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = MEM[B] - MEM[A].
    void
    Inst_DS__DS_RSUB_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_INC_SRC2_U64 class methods ---

    Inst_DS__DS_INC_SRC2_U64::Inst_DS__DS_INC_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_inc_src2_u64")
    {
    } // Inst_DS__DS_INC_SRC2_U64

    Inst_DS__DS_INC_SRC2_U64::~Inst_DS__DS_INC_SRC2_U64()
    {
    } // ~Inst_DS__DS_INC_SRC2_U64

    // --- description from .arch file ---
    // 64b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] >= MEM[B] ? 0 : MEM[A] + 1).
    void
    Inst_DS__DS_INC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_DEC_SRC2_U64 class methods ---

    Inst_DS__DS_DEC_SRC2_U64::Inst_DS__DS_DEC_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_dec_src2_u64")
    {
    } // Inst_DS__DS_DEC_SRC2_U64

    Inst_DS__DS_DEC_SRC2_U64::~Inst_DS__DS_DEC_SRC2_U64()
    {
    } // ~Inst_DS__DS_DEC_SRC2_U64

    // --- description from .arch file ---
    // 64b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = (MEM[A] == 0 || MEM[A] > MEM[B] ? MEM[B] : MEM[A] - 1).
    // Uint decrement.
    void
    Inst_DS__DS_DEC_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_SRC2_I64 class methods ---

    Inst_DS__DS_MIN_SRC2_I64::Inst_DS__DS_MIN_SRC2_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_i64")
    {
    } // Inst_DS__DS_MIN_SRC2_I64

    Inst_DS__DS_MIN_SRC2_I64::~Inst_DS__DS_MIN_SRC2_I64()
    {
    } // ~Inst_DS__DS_MIN_SRC2_I64

    // --- description from .arch file ---
    // 64b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MIN_SRC2_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_SRC2_I64 class methods ---

    Inst_DS__DS_MAX_SRC2_I64::Inst_DS__DS_MAX_SRC2_I64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_i64")
    {
    } // Inst_DS__DS_MAX_SRC2_I64

    Inst_DS__DS_MAX_SRC2_I64::~Inst_DS__DS_MAX_SRC2_I64()
    {
    } // ~Inst_DS__DS_MAX_SRC2_I64

    // --- description from .arch file ---
    // 64b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MAX_SRC2_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MIN_SRC2_U64 class methods ---

    Inst_DS__DS_MIN_SRC2_U64::Inst_DS__DS_MIN_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_min_src2_u64")
    {
    } // Inst_DS__DS_MIN_SRC2_U64

    Inst_DS__DS_MIN_SRC2_U64::~Inst_DS__DS_MIN_SRC2_U64()
    {
    } // ~Inst_DS__DS_MIN_SRC2_U64

    // --- description from .arch file ---
    // 64b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = min(MEM[A], MEM[B]).
    void
    Inst_DS__DS_MIN_SRC2_U64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unimplemented; panics if executed.
        panicUnimplemented();
    } // execute
    // --- Inst_DS__DS_MAX_SRC2_U64 class methods ---

    Inst_DS__DS_MAX_SRC2_U64::Inst_DS__DS_MAX_SRC2_U64(InFmt_DS *iFmt)
        : Inst_DS(iFmt, "ds_max_src2_u64")
    {
    } // Inst_DS__DS_MAX_SRC2_U64

    Inst_DS__DS_MAX_SRC2_U64::~Inst_DS__DS_MAX_SRC2_U64()
    {
    } // ~Inst_DS__DS_MAX_SRC2_U64

    // --- description from .arch file ---
    // 64b:
    // A = ADDR_BASE;
    // B = A + 4*(offset1[7] ? {A[31],A[31:17]} :
    // --- {offset1[6],offset1[6:0],offset0});
    // MEM[A] = max(MEM[A], MEM[B]).
- void - Inst_DS__DS_MAX_SRC2_U64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_AND_SRC2_B64 class methods --- - - Inst_DS__DS_AND_SRC2_B64::Inst_DS__DS_AND_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_and_src2_b64") - { - } // Inst_DS__DS_AND_SRC2_B64 - - Inst_DS__DS_AND_SRC2_B64::~Inst_DS__DS_AND_SRC2_B64() - { - } // ~Inst_DS__DS_AND_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] & MEM[B]. - void - Inst_DS__DS_AND_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_OR_SRC2_B64 class methods --- - - Inst_DS__DS_OR_SRC2_B64::Inst_DS__DS_OR_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_or_src2_b64") - { - } // Inst_DS__DS_OR_SRC2_B64 - - Inst_DS__DS_OR_SRC2_B64::~Inst_DS__DS_OR_SRC2_B64() - { - } // ~Inst_DS__DS_OR_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] | MEM[B]. - void - Inst_DS__DS_OR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_XOR_SRC2_B64 class methods --- - - Inst_DS__DS_XOR_SRC2_B64::Inst_DS__DS_XOR_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_xor_src2_b64") - { - } // Inst_DS__DS_XOR_SRC2_B64 - - Inst_DS__DS_XOR_SRC2_B64::~Inst_DS__DS_XOR_SRC2_B64() - { - } // ~Inst_DS__DS_XOR_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[A] ^ MEM[B]. 
- void - Inst_DS__DS_XOR_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_SRC2_B64 class methods --- - - Inst_DS__DS_WRITE_SRC2_B64::Inst_DS__DS_WRITE_SRC2_B64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_src2_b64") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_SRC2_B64 - - Inst_DS__DS_WRITE_SRC2_B64::~Inst_DS__DS_WRITE_SRC2_B64() - { - } // ~Inst_DS__DS_WRITE_SRC2_B64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = MEM[B]. - // Write qword. - void - Inst_DS__DS_WRITE_SRC2_B64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MIN_SRC2_F64 class methods --- - - Inst_DS__DS_MIN_SRC2_F64::Inst_DS__DS_MIN_SRC2_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_min_src2_f64") - { - setFlag(F64); - } // Inst_DS__DS_MIN_SRC2_F64 - - Inst_DS__DS_MIN_SRC2_F64::~Inst_DS__DS_MIN_SRC2_F64() - { - } // ~Inst_DS__DS_MIN_SRC2_F64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] < MEM[A]) ? MEM[B] : MEM[A]. - // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MIN_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_MAX_SRC2_F64 class methods --- - - Inst_DS__DS_MAX_SRC2_F64::Inst_DS__DS_MAX_SRC2_F64(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_max_src2_f64") - { - setFlag(F64); - } // Inst_DS__DS_MAX_SRC2_F64 - - Inst_DS__DS_MAX_SRC2_F64::~Inst_DS__DS_MAX_SRC2_F64() - { - } // ~Inst_DS__DS_MAX_SRC2_F64 - - // --- description from .arch file --- - // 64b: - // A = ADDR_BASE; - // B = A + 4*(offset1[7] ? {A[31],A[31:17]} : - // --- {offset1[6],offset1[6:0],offset0}); - // MEM[A] = (MEM[B] > MEM[A]) ? MEM[B] : MEM[A]. 
- // Float, handles NaN/INF/denorm. - void - Inst_DS__DS_MAX_SRC2_F64::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_DS__DS_WRITE_B96 class methods --- - - Inst_DS__DS_WRITE_B96::Inst_DS__DS_WRITE_B96(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b96") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B96 - - Inst_DS__DS_WRITE_B96::~Inst_DS__DS_WRITE_B96() - { - } // ~Inst_DS__DS_WRITE_B96 - - // --- description from .arch file --- - // {MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[95:0]. - // Tri-dword write. - void - Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite<3>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_WRITE_B128 class methods 
--- - - Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_write_b128") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_DS__DS_WRITE_B128 - - Inst_DS__DS_WRITE_B128::~Inst_DS__DS_WRITE_B128() - { - } // ~Inst_DS__DS_WRITE_B128 - - // --- description from .arch file --- - // {MEM[ADDR + 12], MEM[ADDR + 8], MEM[ADDR + 4], MEM[ADDR]} = DATA[127:0]. - // Qword write. - void - Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3); - - addr.read(); - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - calcAddr(gpuDynInst, addr); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; - } - } - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemWrite<4>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_DS__DS_READ_B96 class methods --- - - 
Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b96") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B96 - - Inst_DS__DS_READ_B96::~Inst_DS__DS_READ_B96() - { - } // ~Inst_DS__DS_READ_B96 - - // --- description from .arch file --- - // Tri-dword read. - void - Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead<3>(gpuDynInst, offset); - } - - void - Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } - // --- Inst_DS__DS_READ_B128 class methods --- - - Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) - : Inst_DS(iFmt, "ds_read_b128") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_DS__DS_READ_B128 - - Inst_DS__DS_READ_B128::~Inst_DS__DS_READ_B128() - { - } // ~Inst_DS__DS_READ_B128 - - // --- description from .arch file --- - // Qword read. 
- void - Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set( - gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); - ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); - - addr.read(); - - calcAddr(gpuDynInst, addr); - - gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst) - { - Addr offset0 = instData.OFFSET0; - Addr offset1 = instData.OFFSET1; - Addr offset = (offset1 << 8) | offset0; - - initMemRead<4>(gpuDynInst, offset); - } // initiateAcc - - void - Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_X class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_X - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_X - - Inst_MUBUF__BUFFER_LOAD_FORMAT_X::~Inst_MUBUF__BUFFER_LOAD_FORMAT_X() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_X - - // --- description from .arch file --- - // Untyped buffer load 1 
    // dword with format conversion.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
    {
        // Format-converting buffer accesses are not implemented.
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XY class methods ---

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_xy")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY

    // --- description from .arch file ---
    // Untyped buffer load 2 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ class methods ---

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ

    // --- description from .arch file ---
    // Untyped buffer load 3 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW class methods ---

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW

    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW

    // --- description from .arch file ---
    // Untyped buffer load 4 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_X class methods ---

    Inst_MUBUF__BUFFER_STORE_FORMAT_X
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_x")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_X

    Inst_MUBUF__BUFFER_STORE_FORMAT_X::~Inst_MUBUF__BUFFER_STORE_FORMAT_X()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_X

    // --- description from .arch file ---
    // Untyped buffer store 1 dword with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XY class methods ---

    Inst_MUBUF__BUFFER_STORE_FORMAT_XY
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_xy")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XY

    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::~Inst_MUBUF__BUFFER_STORE_FORMAT_XY()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XY

    // --- description from .arch file ---
    // Untyped buffer store 2 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ class methods ---

    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ

    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ

    // --- description from .arch file ---
    // Untyped buffer store 3 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW class methods ---

    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW

    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW

    // --- description from .arch file ---
    // Untyped buffer store 4 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X class methods ---

    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_d16_x")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X

    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X

    // --- description from .arch file ---
    // Untyped buffer load 1 dword with format conversion.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY class methods ---

    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_d16_xy")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY

    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY

    // --- description from .arch file ---
    // Untyped buffer load 2 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ class methods ---

    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ
        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ

    Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ
        ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ

    // --- description from .arch file ---
    // Untyped buffer load 3 dwords with format conversion.
- void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW class methods --- - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW() - { - } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Untyped buffer load 4 dwords with format conversion. - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X class methods --- - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_format_d16_x") - { - setFlag(Store); - } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - - Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X() - { - } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X - - // --- description from .arch file --- - // Untyped buffer store 1 dword with format conversion. 
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
    {
        // Format-converting buffer accesses are not implemented.
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY class methods ---

    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xy")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY

    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY

    // --- description from .arch file ---
    // Untyped buffer store 2 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ class methods ---

    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ

    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ

    // --- description from .arch file ---
    // Untyped buffer store 3 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW class methods ---

    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
        ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW

    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW
        ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW

    // --- description from .arch file ---
    // Untyped buffer store 4 dwords with format conversion.
    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_LOAD_UBYTE class methods ---

    Inst_MUBUF__BUFFER_LOAD_UBYTE
        ::Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_ubyte")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        // When the LDS bit is set the loaded data is routed to LDS instead
        // of a VGPR, so classify the access accordingly.
        if (instData.LDS) {
            setFlag(GroupSegment);
        } else {
            setFlag(GlobalSegment);
        }
    } // Inst_MUBUF__BUFFER_LOAD_UBYTE

    Inst_MUBUF__BUFFER_LOAD_UBYTE::~Inst_MUBUF__BUFFER_LOAD_UBYTE()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_UBYTE

    // --- description from .arch file ---
    // Untyped buffer load unsigned byte (zero extend to VGPR destination).
- void - Inst_MUBUF__BUFFER_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // execute - - // --- Inst_MUBUF__BUFFER_LOAD_SBYTE class methods --- - - Inst_MUBUF__BUFFER_LOAD_SBYTE - ::Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, 
          "buffer_load_sbyte")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_LOAD_SBYTE

    Inst_MUBUF__BUFFER_LOAD_SBYTE::~Inst_MUBUF__BUFFER_LOAD_SBYTE()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_SBYTE

    // --- description from .arch file ---
    // Untyped buffer load signed byte (sign extend to VGPR destination).
    void
    Inst_MUBUF__BUFFER_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst)
    {
        // Signed-byte buffer loads are not implemented.
        panicUnimplemented();
    } // execute

    void
    Inst_MUBUF__BUFFER_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_LOAD_USHORT class methods ---

    Inst_MUBUF__BUFFER_LOAD_USHORT
        ::Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_load_ushort")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        // When the LDS bit is set the loaded data is routed to LDS instead
        // of a VGPR, so classify the access accordingly.
        if (instData.LDS) {
            setFlag(GroupSegment);
        } else {
            setFlag(GlobalSegment);
        }
    } // Inst_MUBUF__BUFFER_LOAD_USHORT

    Inst_MUBUF__BUFFER_LOAD_USHORT::~Inst_MUBUF__BUFFER_LOAD_USHORT()
    {
    } // ~Inst_MUBUF__BUFFER_LOAD_USHORT

    // --- description from .arch file ---
    // Untyped buffer load unsigned short (zero extend to VGPR destination).
- void - Inst_MUBUF__BUFFER_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // execute - - // --- Inst_MUBUF__BUFFER_LOAD_SSHORT class methods --- - - Inst_MUBUF__BUFFER_LOAD_SSHORT - ::Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, 
"buffer_load_sshort") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_LOAD_SSHORT - - Inst_MUBUF__BUFFER_LOAD_SSHORT::~Inst_MUBUF__BUFFER_LOAD_SSHORT() - { - } // ~Inst_MUBUF__BUFFER_LOAD_SSHORT - - // --- description from .arch file --- - // Untyped buffer load signed short (sign extend to VGPR destination). - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORD - ::Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORD - - Inst_MUBUF__BUFFER_LOAD_DWORD::~Inst_MUBUF__BUFFER_LOAD_DWORD() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORD - - // --- description from .arch file --- - // Untyped buffer load dword. 
- void - Inst_MUBUF__BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } else { - vdst[lane] = 0; - } - } - } - - vdst.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_DWORDX2 class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORDX2 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, 
"buffer_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 - - Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer load 2 dwords. - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<2>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 
1); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 2 + 1]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_DWORDX3 class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORDX3 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx3") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 - - Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer load 3 dwords. - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, 
rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<3>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - vdst2[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_LOAD_DWORDX4 class methods --- - - Inst_MUBUF__BUFFER_LOAD_DWORDX4 - ::Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_LOAD_DWORDX4 - - Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4() - { - } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer load 4 dwords. 
- void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - - rsrcDesc.read(); - offset.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDATA); - VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - if (!oobMask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - 
vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } else { - vdst0[lane] = 0; - vdst1[lane] = 0; - vdst2[lane] = 0; - vdst3[lane] = 0; - } - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_BYTE class methods --- - - Inst_MUBUF__BUFFER_STORE_BYTE - ::Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_byte") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_BYTE - - Inst_MUBUF__BUFFER_STORE_BYTE::~Inst_MUBUF__BUFFER_STORE_BYTE() - { - } // ~Inst_MUBUF__BUFFER_STORE_BYTE - - // --- description from .arch file --- - // Untyped buffer store byte. - void - Inst_MUBUF__BUFFER_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandI8 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - 
calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_SHORT class methods --- - - Inst_MUBUF__BUFFER_STORE_SHORT - ::Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_short") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_SHORT - - Inst_MUBUF__BUFFER_STORE_SHORT::~Inst_MUBUF__BUFFER_STORE_SHORT() - { - } // ~Inst_MUBUF__BUFFER_STORE_SHORT - - // --- description from .arch file --- - // Untyped buffer store short. 
- void - Inst_MUBUF__BUFFER_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandI16 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MUBUF__BUFFER_STORE_DWORD class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORD:: - Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dword") - { - 
setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORD - - Inst_MUBUF__BUFFER_STORE_DWORD::~Inst_MUBUF__BUFFER_STORE_DWORD() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORD - - // --- description from .arch file --- - // Untyped buffer store dword. - void - Inst_MUBUF__BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data(gpuDynInst, extData.VDATA); - - rsrcDesc.read(); - offset.read(); - data.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - 
initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_DWORDX2 class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORDX2 - ::Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX2 - - Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer store 2 dwords. - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - 
addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<2>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_DWORDX3 class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORDX3 - ::Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx3") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX3 - - Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer store 3 dwords. 
- void - Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - data2.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] - = data2[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<3>(gpuDynInst); - } // initiateAcc - - 
void - Inst_MUBUF__BUFFER_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_DWORDX4 class methods --- - - Inst_MUBUF__BUFFER_STORE_DWORDX4 - ::Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - if (instData.LDS) { - setFlag(GroupSegment); - } else { - setFlag(GlobalSegment); - } - } // Inst_MUBUF__BUFFER_STORE_DWORDX4 - - Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4() - { - } // ~Inst_MUBUF__BUFFER_STORE_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer store 4 dwords. - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); - ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3); - - rsrcDesc.read(); - offset.read(); - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - 
addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane * 4] - = data0[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] - = data1[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] - = data2[lane]; - (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 3] - = data3[lane]; - } - } - } // execute - - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_STORE_LDS_DWORD class methods --- - - Inst_MUBUF__BUFFER_STORE_LDS_DWORD - ::Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_store_lds_dword") - { - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_STORE_LDS_DWORD - - Inst_MUBUF__BUFFER_STORE_LDS_DWORD::~Inst_MUBUF__BUFFER_STORE_LDS_DWORD() - { - } // ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD - - // --- description from .arch file --- - // Store one DWORD from LDS memory to system memory without utilizing - // VGPRs. 
- void - Inst_MUBUF__BUFFER_STORE_LDS_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_WBINVL1 class methods --- - - Inst_MUBUF__BUFFER_WBINVL1::Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_wbinvl1") - { - setFlag(MemoryRef); - setFlag(GPUStaticInst::MemSync); - setFlag(GlobalSegment); - setFlag(MemSync); - } // Inst_MUBUF__BUFFER_WBINVL1 - - Inst_MUBUF__BUFFER_WBINVL1::~Inst_MUBUF__BUFFER_WBINVL1() - { - } // ~Inst_MUBUF__BUFFER_WBINVL1 - - // --- description from .arch file --- - // Write back and invalidate the shader L1. - // Always returns ACK to shader. - void - Inst_MUBUF__BUFFER_WBINVL1::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. - issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } // execute - - void - Inst_MUBUF__BUFFER_WBINVL1::initiateAcc(GPUDynInstPtr gpuDynInst) - { - // TODO: Fix it for gfx10. Once we have the new gfx10 cache model, we - // need to precisely communicate the writeback-invalidate operation to - // the new gfx10 coalescer rather than sending AcquireRelease markers. - // The SICoalescer would need to be updated appropriately as well. 
- injectGlobalMemFence(gpuDynInst); - } // initiateAcc - void - Inst_MUBUF__BUFFER_WBINVL1::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_WBINVL1_VOL class methods --- - - Inst_MUBUF__BUFFER_WBINVL1_VOL - ::Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF*iFmt) - : Inst_MUBUF(iFmt, "buffer_wbinvl1_vol") { - // This instruction is same as buffer_wbinvl1 instruction except this - // instruction only invalidate L1 shader line with MTYPE SC and GC. - // Since Hermes L1 (TCP) do not differentiate between its cache lines, - // this instruction currently behaves (and implemented ) exactly like - // buffer_wbinvl1 instruction. - setFlag(MemoryRef); - setFlag(GPUStaticInst::MemSync); - setFlag(GlobalSegment); - setFlag(MemSync); - } // Inst_MUBUF__BUFFER_WBINVL1_VOL - - Inst_MUBUF__BUFFER_WBINVL1_VOL::~Inst_MUBUF__BUFFER_WBINVL1_VOL() - { - } // ~Inst_MUBUF__BUFFER_WBINVL1_VOL - - // --- description from .arch file --- - // Write back and invalidate the shader L1 only for lines that are marked - // --- volatile. - // Always returns ACK to shader. - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { - gpuDynInst->computeUnit()->globalMemoryPipe. 
- issueRequest(gpuDynInst); - } else { - fatal("Unsupported scope for flat instruction.\n"); - } - } // execute - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst) - { - injectGlobalMemFence(gpuDynInst); - } // initiateAcc - void - Inst_MUBUF__BUFFER_WBINVL1_VOL::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SWAP - ::Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_swap") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SWAP - - Inst_MUBUF__BUFFER_ATOMIC_SWAP::~Inst_MUBUF__BUFFER_ATOMIC_SWAP() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); - ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); - ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); - ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); - ConstVecOperandU32 src(gpuDynInst, extData.VDATA); - ConstVecOperandU32 cmp(gpuDynInst, extData.VDATA + 1); - - rsrcDesc.read(); - offset.read(); - src.read(); - cmp.read(); - - int inst_offset = instData.OFFSET; - - if (!instData.IDXEN && !instData.OFFEN) { - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (!instData.IDXEN && instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr0, addr1, rsrcDesc, offset, inst_offset); - } else if (instData.IDXEN && !instData.OFFEN) { - addr0.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } else { - addr0.read(); - addr1.read(); - calcAddr(gpuDynInst, - addr1, addr0, rsrcDesc, offset, inst_offset); - } - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->x_data))[lane] - = src[lane]; - (reinterpret_cast(gpuDynInst->a_data))[lane] - = cmp[lane]; - } - } - - gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); - } // execute - - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDATA); - - for (int lane = 0; lane < 
NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - // --- Inst_MUBUF__BUFFER_ATOMIC_ADD class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_ADD - ::Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_ADD - - Inst_MUBUF__BUFFER_ATOMIC_ADD::~Inst_MUBUF__BUFFER_ATOMIC_ADD() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SUB class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SUB - ::Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SUB - - Inst_MUBUF__BUFFER_ATOMIC_SUB::~Inst_MUBUF__BUFFER_ATOMIC_SUB() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMIN - ::Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMIN - - Inst_MUBUF__BUFFER_ATOMIC_SMIN::~Inst_MUBUF__BUFFER_ATOMIC_SMIN() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMIN - ::Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMIN - - Inst_MUBUF__BUFFER_ATOMIC_UMIN::~Inst_MUBUF__BUFFER_ATOMIC_UMIN() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMAX - ::Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMAX - - Inst_MUBUF__BUFFER_ATOMIC_SMAX::~Inst_MUBUF__BUFFER_ATOMIC_SMAX() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMAX - ::Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMAX - - Inst_MUBUF__BUFFER_ATOMIC_UMAX::~Inst_MUBUF__BUFFER_ATOMIC_UMAX() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_AND class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_AND - ::Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_AND - - Inst_MUBUF__BUFFER_ATOMIC_AND::~Inst_MUBUF__BUFFER_ATOMIC_AND() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_AND - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_OR class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_OR - ::Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_OR - - Inst_MUBUF__BUFFER_ATOMIC_OR::~Inst_MUBUF__BUFFER_ATOMIC_OR() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_OR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_XOR class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_XOR - ::Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_XOR - - Inst_MUBUF__BUFFER_ATOMIC_XOR::~Inst_MUBUF__BUFFER_ATOMIC_XOR() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_INC class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_INC - ::Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_INC - - Inst_MUBUF__BUFFER_ATOMIC_INC::~Inst_MUBUF__BUFFER_ATOMIC_INC() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_INC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_DEC class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_DEC - ::Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_DEC - - Inst_MUBUF__BUFFER_ATOMIC_DEC::~Inst_MUBUF__BUFFER_ATOMIC_DEC() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_swap_x2") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap_x2") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - ::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA[0:1]; - // cmp = DATA[2:3]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_add_x2") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_sub_x2") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_smax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_umax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_AND_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_and_x2") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - - Inst_MUBUF__BUFFER_ATOMIC_AND_X2::~Inst_MUBUF__BUFFER_ATOMIC_AND_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_OR_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_or_x2") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - } // Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - - Inst_MUBUF__BUFFER_ATOMIC_OR_X2::~Inst_MUBUF__BUFFER_ATOMIC_OR_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_xor_x2") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_INC_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_inc_x2") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - - Inst_MUBUF__BUFFER_ATOMIC_INC_X2::~Inst_MUBUF__BUFFER_ATOMIC_INC_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_MUBUF__BUFFER_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 class methods --- - - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - ::Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF *iFmt) - : Inst_MUBUF(iFmt, "buffer_atomic_dec_x2") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - setFlag(GlobalSegment); - } // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2() - { - } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_X class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X - - // --- description from .arch file --- - // Typed buffer load 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY - - // --- description from .arch file --- - // Typed buffer load 2 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ - - // --- description from .arch file --- - // Typed buffer load 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW - - // --- description from .arch file --- - // Typed buffer load 4 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_X class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_X - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_X - - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X - - // --- description from .arch file --- - // Typed buffer store 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XY class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY - - // --- description from .arch file --- - // Typed buffer store 2 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xyz") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ - - // --- description from .arch file --- - // Typed buffer store 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW - - // --- description from .arch file --- - // Typed buffer store 4 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X:: - ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X - - // --- description from .arch file --- - // Typed buffer load 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY - - // --- description from .arch file --- - // Typed buffer load 2 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ( - InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ - - // --- description from .arch file --- - // Typed buffer load 3 dwords with format conversion. - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW class methods --- - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW( - InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw") - { - setFlag(MemoryRef); - setFlag(Load); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW() - { - } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW - - // --- description from .arch file --- - // Typed buffer load 4 dwords with format conversion. 
- void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X - - // --- description from .arch file --- - // Typed buffer store 1 dword with format conversion. - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc( - GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc( - GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY class methods --- - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt) - : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy") - { - setFlag(MemoryRef); - setFlag(Store); - setFlag(GlobalSegment); - } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY() - { - } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY - - // --- description from .arch file --- - // Typed buffer store 2 dwords with format conversion. 
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ class methods ---

    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ

    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ()
    {
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ

    // --- description from .arch file ---
    // Typed buffer store 3 dwords with format conversion.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW class methods ---

    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt)
        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW

    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW()
    {
    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW

    // --- description from .arch file ---
    // Typed buffer store 4 dwords with format conversion.
    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute(
        GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
        GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_LOAD class methods ---

    Inst_MIMG__IMAGE_LOAD::Inst_MIMG__IMAGE_LOAD(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD

    Inst_MIMG__IMAGE_LOAD::~Inst_MIMG__IMAGE_LOAD()
    {
    } // ~Inst_MIMG__IMAGE_LOAD

    // --- description from .arch file ---
    // Image memory load with format conversion specified in T#. No sampler.
    void
    Inst_MIMG__IMAGE_LOAD::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_LOAD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_LOAD_MIP class methods ---

    Inst_MIMG__IMAGE_LOAD_MIP::Inst_MIMG__IMAGE_LOAD_MIP(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_mip")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_MIP

    Inst_MIMG__IMAGE_LOAD_MIP::~Inst_MIMG__IMAGE_LOAD_MIP()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_MIP

    // --- description from .arch file ---
    // Image memory load with user-supplied mip level. No sampler.
    void
    Inst_MIMG__IMAGE_LOAD_MIP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_LOAD_MIP::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_MIP::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_LOAD_PCK class methods ---

    Inst_MIMG__IMAGE_LOAD_PCK::Inst_MIMG__IMAGE_LOAD_PCK(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_pck")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_PCK

    Inst_MIMG__IMAGE_LOAD_PCK::~Inst_MIMG__IMAGE_LOAD_PCK()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_PCK

    // --- description from .arch file ---
    // Image memory load with no format conversion. No sampler.
    void
    Inst_MIMG__IMAGE_LOAD_PCK::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_LOAD_PCK::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_PCK::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_LOAD_PCK_SGN class methods ---

    Inst_MIMG__IMAGE_LOAD_PCK_SGN::Inst_MIMG__IMAGE_LOAD_PCK_SGN(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_pck_sgn")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_PCK_SGN

    Inst_MIMG__IMAGE_LOAD_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_PCK_SGN()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_PCK_SGN

    // --- description from .arch file ---
    // Image memory load with no format conversion and sign extension.
    // No sampler.
    void
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK class methods ---

    Inst_MIMG__IMAGE_LOAD_MIP_PCK::Inst_MIMG__IMAGE_LOAD_MIP_PCK(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_mip_pck")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_MIP_PCK

    Inst_MIMG__IMAGE_LOAD_MIP_PCK::~Inst_MIMG__IMAGE_LOAD_MIP_PCK()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK

    // --- description from .arch file ---
    // Image memory load with user-supplied mip level, no format conversion.
    // No sampler.
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN class methods ---

    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_load_mip_pck_sgn")
    {
        setFlag(MemoryRef);
        setFlag(Load);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN

    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN()
    {
    } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN

    // --- description from .arch file ---
    // Image memory load with user-supplied mip level, no format conversion
    // and with sign extension. No sampler.
    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_STORE class methods ---

    Inst_MIMG__IMAGE_STORE::Inst_MIMG__IMAGE_STORE(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE

    Inst_MIMG__IMAGE_STORE::~Inst_MIMG__IMAGE_STORE()
    {
    } // ~Inst_MIMG__IMAGE_STORE

    // --- description from .arch file ---
    // Image memory store with format conversion specified in T#. No sampler.
    void
    Inst_MIMG__IMAGE_STORE::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_STORE::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_STORE::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_STORE_MIP class methods ---

    Inst_MIMG__IMAGE_STORE_MIP::Inst_MIMG__IMAGE_STORE_MIP(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store_mip")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE_MIP

    Inst_MIMG__IMAGE_STORE_MIP::~Inst_MIMG__IMAGE_STORE_MIP()
    {
    } // ~Inst_MIMG__IMAGE_STORE_MIP

    // --- description from .arch file ---
    // Image memory store with format conversion specified in T# to user
    // specified mip level. No sampler.
    void
    Inst_MIMG__IMAGE_STORE_MIP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_STORE_MIP::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_STORE_MIP::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_STORE_PCK class methods ---

    Inst_MIMG__IMAGE_STORE_PCK::Inst_MIMG__IMAGE_STORE_PCK(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store_pck")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE_PCK

    Inst_MIMG__IMAGE_STORE_PCK::~Inst_MIMG__IMAGE_STORE_PCK()
    {
    } // ~Inst_MIMG__IMAGE_STORE_PCK

    // --- description from .arch file ---
    // Image memory store of packed data without format conversion. No sampler.
    void
    Inst_MIMG__IMAGE_STORE_PCK::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_STORE_PCK::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_STORE_PCK::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_STORE_MIP_PCK class methods ---

    Inst_MIMG__IMAGE_STORE_MIP_PCK::Inst_MIMG__IMAGE_STORE_MIP_PCK(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_store_mip_pck")
    {
        setFlag(MemoryRef);
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_STORE_MIP_PCK

    Inst_MIMG__IMAGE_STORE_MIP_PCK::~Inst_MIMG__IMAGE_STORE_MIP_PCK()
    {
    } // ~Inst_MIMG__IMAGE_STORE_MIP_PCK

    // --- description from .arch file ---
    // Image memory store of packed data without format conversion to
    // user-supplied mip level. No sampler.
    void
    Inst_MIMG__IMAGE_STORE_MIP_PCK::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute

    void
    Inst_MIMG__IMAGE_STORE_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_MIMG__IMAGE_STORE_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MIMG__IMAGE_GET_RESINFO class methods ---

    Inst_MIMG__IMAGE_GET_RESINFO::Inst_MIMG__IMAGE_GET_RESINFO(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_get_resinfo")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_GET_RESINFO

    Inst_MIMG__IMAGE_GET_RESINFO::~Inst_MIMG__IMAGE_GET_RESINFO()
    {
    } // ~Inst_MIMG__IMAGE_GET_RESINFO

    // --- description from .arch file ---
    // return resource info for a given mip level specified in the address
    // vgpr. No sampler. Returns 4 integer values into VGPRs 3-0:
    // {num_mip_levels, depth, height, width}.
    void
    Inst_MIMG__IMAGE_GET_RESINFO::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_SWAP class methods ---

    Inst_MIMG__IMAGE_ATOMIC_SWAP::Inst_MIMG__IMAGE_ATOMIC_SWAP(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_swap")
    {
        setFlag(AtomicExch);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SWAP

    Inst_MIMG__IMAGE_ATOMIC_SWAP::~Inst_MIMG__IMAGE_ATOMIC_SWAP()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SWAP

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_CMPSWAP class methods ---

    Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::Inst_MIMG__IMAGE_ATOMIC_CMPSWAP(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_cmpswap")
    {
        setFlag(AtomicCAS);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP

    Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // src = DATA[0];
    // cmp = DATA[1];
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0] = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_ADD class methods ---

    Inst_MIMG__IMAGE_ATOMIC_ADD::Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_add")
    {
        setFlag(AtomicAdd);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_ADD

    Inst_MIMG__IMAGE_ATOMIC_ADD::~Inst_MIMG__IMAGE_ATOMIC_ADD()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_ADD

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_SUB class methods ---

    Inst_MIMG__IMAGE_ATOMIC_SUB::Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_sub")
    {
        setFlag(AtomicSub);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SUB

    Inst_MIMG__IMAGE_ATOMIC_SUB::~Inst_MIMG__IMAGE_ATOMIC_SUB()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SUB

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_SMIN class methods ---

    Inst_MIMG__IMAGE_ATOMIC_SMIN::Inst_MIMG__IMAGE_ATOMIC_SMIN(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_smin")
    {
        setFlag(AtomicMin);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SMIN

    Inst_MIMG__IMAGE_ATOMIC_SMIN::~Inst_MIMG__IMAGE_ATOMIC_SMIN()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SMIN

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_UMIN class methods ---

    Inst_MIMG__IMAGE_ATOMIC_UMIN::Inst_MIMG__IMAGE_ATOMIC_UMIN(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_umin")
    {
        setFlag(AtomicMin);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_UMIN

    Inst_MIMG__IMAGE_ATOMIC_UMIN::~Inst_MIMG__IMAGE_ATOMIC_UMIN()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_UMIN

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_SMAX class methods ---

    Inst_MIMG__IMAGE_ATOMIC_SMAX::Inst_MIMG__IMAGE_ATOMIC_SMAX(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_smax")
    {
        setFlag(AtomicMax);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_SMAX

    Inst_MIMG__IMAGE_ATOMIC_SMAX::~Inst_MIMG__IMAGE_ATOMIC_SMAX()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_SMAX

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_UMAX class methods ---

    Inst_MIMG__IMAGE_ATOMIC_UMAX::Inst_MIMG__IMAGE_ATOMIC_UMAX(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_umax")
    {
        setFlag(AtomicMax);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_UMAX

    Inst_MIMG__IMAGE_ATOMIC_UMAX::~Inst_MIMG__IMAGE_ATOMIC_UMAX()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_UMAX

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_AND class methods ---

    Inst_MIMG__IMAGE_ATOMIC_AND::Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_and")
    {
        setFlag(AtomicAnd);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_AND

    Inst_MIMG__IMAGE_ATOMIC_AND::~Inst_MIMG__IMAGE_ATOMIC_AND()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_AND

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_OR class methods ---

    Inst_MIMG__IMAGE_ATOMIC_OR::Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_or")
    {
        setFlag(AtomicOr);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_OR

    Inst_MIMG__IMAGE_ATOMIC_OR::~Inst_MIMG__IMAGE_ATOMIC_OR()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_OR

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_XOR class methods ---

    Inst_MIMG__IMAGE_ATOMIC_XOR::Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_xor")
    {
        setFlag(AtomicXor);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_XOR

    Inst_MIMG__IMAGE_ATOMIC_XOR::~Inst_MIMG__IMAGE_ATOMIC_XOR()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_XOR

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_INC class methods ---

    Inst_MIMG__IMAGE_ATOMIC_INC::Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_inc")
    {
        setFlag(AtomicInc);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_INC

    Inst_MIMG__IMAGE_ATOMIC_INC::~Inst_MIMG__IMAGE_ATOMIC_INC()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_INC

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_ATOMIC_DEC class methods ---

    Inst_MIMG__IMAGE_ATOMIC_DEC::Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_atomic_dec")
    {
        setFlag(AtomicDec);
        // The GLC bit selects whether this atomic returns a value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_ATOMIC_DEC

    Inst_MIMG__IMAGE_ATOMIC_DEC::~Inst_MIMG__IMAGE_ATOMIC_DEC()
    {
    } // ~Inst_MIMG__IMAGE_ATOMIC_DEC

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare); RETURN_DATA = tmp.
    void
    Inst_MIMG__IMAGE_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE class methods ---

    // NOTE(review): unlike the other image_sample variants below, this
    // constructor sets no flags (not even GlobalSegment) — confirm this
    // is intentional.
    Inst_MIMG__IMAGE_SAMPLE::Inst_MIMG__IMAGE_SAMPLE(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample")
    {
    } // Inst_MIMG__IMAGE_SAMPLE

    Inst_MIMG__IMAGE_SAMPLE::~Inst_MIMG__IMAGE_SAMPLE()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE

    // --- description from .arch file ---
    // sample texture map.
    void
    Inst_MIMG__IMAGE_SAMPLE::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_CL class methods ---

    Inst_MIMG__IMAGE_SAMPLE_CL::Inst_MIMG__IMAGE_SAMPLE_CL(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CL

    Inst_MIMG__IMAGE_SAMPLE_CL::~Inst_MIMG__IMAGE_SAMPLE_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_CL

    // --- description from .arch file ---
    // sample texture map, with LOD clamp specified in shader.
    void
    Inst_MIMG__IMAGE_SAMPLE_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_D class methods ---

    Inst_MIMG__IMAGE_SAMPLE_D::Inst_MIMG__IMAGE_SAMPLE_D(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D

    Inst_MIMG__IMAGE_SAMPLE_D::~Inst_MIMG__IMAGE_SAMPLE_D()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D

    // --- description from .arch file ---
    // sample texture map, with user derivatives
    void
    Inst_MIMG__IMAGE_SAMPLE_D::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_D_CL class methods ---

    Inst_MIMG__IMAGE_SAMPLE_D_CL::Inst_MIMG__IMAGE_SAMPLE_D_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D_CL

    Inst_MIMG__IMAGE_SAMPLE_D_CL::~Inst_MIMG__IMAGE_SAMPLE_D_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL

    // --- description from .arch file ---
    // sample texture map, with LOD clamp specified in shader, with user
    // derivatives.
    void
    Inst_MIMG__IMAGE_SAMPLE_D_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_L class methods ---

    Inst_MIMG__IMAGE_SAMPLE_L::Inst_MIMG__IMAGE_SAMPLE_L(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_l")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_L

    Inst_MIMG__IMAGE_SAMPLE_L::~Inst_MIMG__IMAGE_SAMPLE_L()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_L

    // --- description from .arch file ---
    // sample texture map, with user LOD.
    void
    Inst_MIMG__IMAGE_SAMPLE_L::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_B class methods ---

    Inst_MIMG__IMAGE_SAMPLE_B::Inst_MIMG__IMAGE_SAMPLE_B(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B

    Inst_MIMG__IMAGE_SAMPLE_B::~Inst_MIMG__IMAGE_SAMPLE_B()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_B

    // --- description from .arch file ---
    // sample texture map, with lod bias.
    void
    Inst_MIMG__IMAGE_SAMPLE_B::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_B_CL class methods ---

    Inst_MIMG__IMAGE_SAMPLE_B_CL::Inst_MIMG__IMAGE_SAMPLE_B_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B_CL

    Inst_MIMG__IMAGE_SAMPLE_B_CL::~Inst_MIMG__IMAGE_SAMPLE_B_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL

    // --- description from .arch file ---
    // sample texture map, with LOD clamp specified in shader, with lod bias.
    void
    Inst_MIMG__IMAGE_SAMPLE_B_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_LZ class methods ---

    Inst_MIMG__IMAGE_SAMPLE_LZ::Inst_MIMG__IMAGE_SAMPLE_LZ(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_lz")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_LZ

    Inst_MIMG__IMAGE_SAMPLE_LZ::~Inst_MIMG__IMAGE_SAMPLE_LZ()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_LZ

    // --- description from .arch file ---
    // sample texture map, from level 0.
    void
    Inst_MIMG__IMAGE_SAMPLE_LZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_C class methods ---

    Inst_MIMG__IMAGE_SAMPLE_C::Inst_MIMG__IMAGE_SAMPLE_C(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C

    Inst_MIMG__IMAGE_SAMPLE_C::~Inst_MIMG__IMAGE_SAMPLE_C()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C

    // --- description from .arch file ---
    // sample texture map, with PCF.
    void
    Inst_MIMG__IMAGE_SAMPLE_C::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_C_CL class methods ---

    Inst_MIMG__IMAGE_SAMPLE_C_CL::Inst_MIMG__IMAGE_SAMPLE_C_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_CL

    Inst_MIMG__IMAGE_SAMPLE_C_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL

    // --- description from .arch file ---
    // SAMPLE_C, with LOD clamp specified in shader.
    void
    Inst_MIMG__IMAGE_SAMPLE_C_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_C_D class methods ---

    Inst_MIMG__IMAGE_SAMPLE_C_D::Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D

    Inst_MIMG__IMAGE_SAMPLE_C_D::~Inst_MIMG__IMAGE_SAMPLE_C_D()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_D

    // --- description from .arch file ---
    // SAMPLE_C, with user derivatives.
    void
    Inst_MIMG__IMAGE_SAMPLE_C_D::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL class methods ---

    Inst_MIMG__IMAGE_SAMPLE_C_D_CL::Inst_MIMG__IMAGE_SAMPLE_C_D_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_d_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL

    Inst_MIMG__IMAGE_SAMPLE_C_D_CL::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL

    // --- description from .arch file ---
    // SAMPLE_C, with LOD clamp specified in shader, with user derivatives.
    void
    Inst_MIMG__IMAGE_SAMPLE_C_D_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_C_L class methods ---

    Inst_MIMG__IMAGE_SAMPLE_C_L::Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_l")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_L

    Inst_MIMG__IMAGE_SAMPLE_C_L::~Inst_MIMG__IMAGE_SAMPLE_C_L()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_L

    // --- description from .arch file ---
    // SAMPLE_C, with user LOD.
    void
    Inst_MIMG__IMAGE_SAMPLE_C_L::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_C_B class methods ---

    Inst_MIMG__IMAGE_SAMPLE_C_B::Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_b")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_B

    Inst_MIMG__IMAGE_SAMPLE_C_B::~Inst_MIMG__IMAGE_SAMPLE_C_B()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_B

    // --- description from .arch file ---
    // SAMPLE_C, with lod bias.
    void
    Inst_MIMG__IMAGE_SAMPLE_C_B::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL class methods ---

    Inst_MIMG__IMAGE_SAMPLE_C_B_CL::Inst_MIMG__IMAGE_SAMPLE_C_B_CL(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_b_cl")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL

    Inst_MIMG__IMAGE_SAMPLE_C_B_CL::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL

    // --- description from .arch file ---
    // SAMPLE_C, with LOD clamp specified in shader, with lod bias.
    void
    Inst_MIMG__IMAGE_SAMPLE_C_B_CL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ class methods ---

    Inst_MIMG__IMAGE_SAMPLE_C_LZ::Inst_MIMG__IMAGE_SAMPLE_C_LZ(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_c_lz")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_C_LZ

    Inst_MIMG__IMAGE_SAMPLE_C_LZ::~Inst_MIMG__IMAGE_SAMPLE_C_LZ()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ

    // --- description from .arch file ---
    // SAMPLE_C, from level 0.
    void
    Inst_MIMG__IMAGE_SAMPLE_C_LZ::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_O class methods ---

    Inst_MIMG__IMAGE_SAMPLE_O::Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_O

    Inst_MIMG__IMAGE_SAMPLE_O::~Inst_MIMG__IMAGE_SAMPLE_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_O

    // --- description from .arch file ---
    // sample texture map, with user offsets.
    void
    Inst_MIMG__IMAGE_SAMPLE_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_CL_O class methods ---

    Inst_MIMG__IMAGE_SAMPLE_CL_O::Inst_MIMG__IMAGE_SAMPLE_CL_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_cl_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_CL_O

    Inst_MIMG__IMAGE_SAMPLE_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CL_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_CL_O

    // --- description from .arch file ---
    // SAMPLE_O with LOD clamp specified in shader.
    void
    Inst_MIMG__IMAGE_SAMPLE_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_D_O class methods ---

    Inst_MIMG__IMAGE_SAMPLE_D_O::Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D_O

    Inst_MIMG__IMAGE_SAMPLE_D_O::~Inst_MIMG__IMAGE_SAMPLE_D_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D_O

    // --- description from .arch file ---
    // SAMPLE_O, with user derivatives.
    void
    Inst_MIMG__IMAGE_SAMPLE_D_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_D_CL_O class methods ---

    Inst_MIMG__IMAGE_SAMPLE_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_D_CL_O(
        InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_d_cl_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_D_CL_O

    Inst_MIMG__IMAGE_SAMPLE_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_D_CL_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL_O

    // --- description from .arch file ---
    // SAMPLE_O, with LOD clamp specified in shader, with user derivatives.
    void
    Inst_MIMG__IMAGE_SAMPLE_D_CL_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_L_O class methods ---

    Inst_MIMG__IMAGE_SAMPLE_L_O::Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_l_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_L_O

    Inst_MIMG__IMAGE_SAMPLE_L_O::~Inst_MIMG__IMAGE_SAMPLE_L_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_L_O

    // --- description from .arch file ---
    // SAMPLE_O, with user LOD.
    void
    Inst_MIMG__IMAGE_SAMPLE_L_O::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MIMG__IMAGE_SAMPLE_B_O class methods ---

    Inst_MIMG__IMAGE_SAMPLE_B_O::Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG *iFmt)
        : Inst_MIMG(iFmt, "image_sample_b_o")
    {
        setFlag(GlobalSegment);
    } // Inst_MIMG__IMAGE_SAMPLE_B_O

    Inst_MIMG__IMAGE_SAMPLE_B_O::~Inst_MIMG__IMAGE_SAMPLE_B_O()
    {
    } // ~Inst_MIMG__IMAGE_SAMPLE_B_O

    // --- description from .arch file ---
    // SAMPLE_O, with lod bias.
- void - Inst_MIMG__IMAGE_SAMPLE_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_B_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_B_CL_O - - Inst_MIMG__IMAGE_SAMPLE_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL_O - - // --- description from .arch file --- - // SAMPLE_O, with LOD clamp specified in shader, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_B_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_LZ_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_LZ_O::Inst_MIMG__IMAGE_SAMPLE_LZ_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_lz_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_LZ_O - - Inst_MIMG__IMAGE_SAMPLE_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_LZ_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_LZ_O - - // --- description from .arch file --- - // SAMPLE_O, from level 0. - void - Inst_MIMG__IMAGE_SAMPLE_LZ_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_O::Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_O - - Inst_MIMG__IMAGE_SAMPLE_C_O::~Inst_MIMG__IMAGE_SAMPLE_C_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_O - - // --- description from .arch file --- - // SAMPLE_C with user specified offsets. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader. - void - Inst_MIMG__IMAGE_SAMPLE_C_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D_O::Inst_MIMG__IMAGE_SAMPLE_C_D_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D_O - - Inst_MIMG__IMAGE_SAMPLE_C_D_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_O - - // --- description from .arch file --- - // SAMPLE_C_O, with user derivatives. - void - Inst_MIMG__IMAGE_SAMPLE_C_D_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_d_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives. 
- void - Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_L_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_L_O::Inst_MIMG__IMAGE_SAMPLE_C_L_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_l_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_L_O - - Inst_MIMG__IMAGE_SAMPLE_C_L_O::~Inst_MIMG__IMAGE_SAMPLE_C_L_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_L_O - - // --- description from .arch file --- - // SAMPLE_C_O, with user LOD. - void - Inst_MIMG__IMAGE_SAMPLE_C_L_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B_O::Inst_MIMG__IMAGE_SAMPLE_C_B_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B_O - - Inst_MIMG__IMAGE_SAMPLE_C_B_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_O - - // --- description from .arch file --- - // SAMPLE_C_O, with lod bias. - void - Inst_MIMG__IMAGE_SAMPLE_C_B_O::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O class methods --- - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O( - InFmt_MIMG *iFmt) - : Inst_MIMG(iFmt, "image_sample_c_b_cl_o") - { - setFlag(GlobalSegment); - } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O - - Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O() - { - } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O - - // --- description from .arch file --- - // SAMPLE_C_O, with LOD clamp specified in shader, with lod bias. 
// NOTE(review): the GATHER4 family below is decode-only as well — each
// execute() calls panicUnimplemented(); only the global-segment flag is set.
void
Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_SAMPLE_C_LZ_O class methods ---

Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::Inst_MIMG__IMAGE_SAMPLE_C_LZ_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_sample_c_lz_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O

Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O()
{
} // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O

// --- description from .arch file ---
// SAMPLE_C_O, from level 0.
void
Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4 class methods ---

Inst_MIMG__IMAGE_GATHER4::Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4

Inst_MIMG__IMAGE_GATHER4::~Inst_MIMG__IMAGE_GATHER4()
{
} // ~Inst_MIMG__IMAGE_GATHER4

// --- description from .arch file ---
// gather 4 single component elements (2x2).
void
Inst_MIMG__IMAGE_GATHER4::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_CL class methods ---

Inst_MIMG__IMAGE_GATHER4_CL::Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_cl")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_CL

Inst_MIMG__IMAGE_GATHER4_CL::~Inst_MIMG__IMAGE_GATHER4_CL()
{
} // ~Inst_MIMG__IMAGE_GATHER4_CL

// --- description from .arch file ---
// gather 4 single component elements (2x2) with user LOD clamp.
void
Inst_MIMG__IMAGE_GATHER4_CL::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_L class methods ---

Inst_MIMG__IMAGE_GATHER4_L::Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_l")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_L

Inst_MIMG__IMAGE_GATHER4_L::~Inst_MIMG__IMAGE_GATHER4_L()
{
} // ~Inst_MIMG__IMAGE_GATHER4_L

// --- description from .arch file ---
// gather 4 single component elements (2x2) with user LOD.
void
Inst_MIMG__IMAGE_GATHER4_L::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_B class methods ---

Inst_MIMG__IMAGE_GATHER4_B::Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_b")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_B

Inst_MIMG__IMAGE_GATHER4_B::~Inst_MIMG__IMAGE_GATHER4_B()
{
} // ~Inst_MIMG__IMAGE_GATHER4_B

// --- description from .arch file ---
// gather 4 single component elements (2x2) with user bias.
void
Inst_MIMG__IMAGE_GATHER4_B::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_B_CL class methods ---

Inst_MIMG__IMAGE_GATHER4_B_CL::Inst_MIMG__IMAGE_GATHER4_B_CL(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_b_cl")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_B_CL

Inst_MIMG__IMAGE_GATHER4_B_CL::~Inst_MIMG__IMAGE_GATHER4_B_CL()
{
} // ~Inst_MIMG__IMAGE_GATHER4_B_CL

// --- description from .arch file ---
// gather 4 single component elements (2x2) with user bias and clamp.
void
Inst_MIMG__IMAGE_GATHER4_B_CL::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_LZ class methods ---

Inst_MIMG__IMAGE_GATHER4_LZ::Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_lz")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_LZ

Inst_MIMG__IMAGE_GATHER4_LZ::~Inst_MIMG__IMAGE_GATHER4_LZ()
{
} // ~Inst_MIMG__IMAGE_GATHER4_LZ

// --- description from .arch file ---
// gather 4 single component elements (2x2) at level 0.
void
Inst_MIMG__IMAGE_GATHER4_LZ::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C class methods ---

Inst_MIMG__IMAGE_GATHER4_C::Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C

Inst_MIMG__IMAGE_GATHER4_C::~Inst_MIMG__IMAGE_GATHER4_C()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C

// --- description from .arch file ---
// gather 4 single component elements (2x2) with PCF.
void
Inst_MIMG__IMAGE_GATHER4_C::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_CL class methods ---

Inst_MIMG__IMAGE_GATHER4_C_CL::Inst_MIMG__IMAGE_GATHER4_C_CL(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_cl")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_CL

Inst_MIMG__IMAGE_GATHER4_C_CL::~Inst_MIMG__IMAGE_GATHER4_C_CL()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_CL

// --- description from .arch file ---
// gather 4 single component elements (2x2) with user LOD clamp and PCF.
void
Inst_MIMG__IMAGE_GATHER4_C_CL::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_L class methods ---

Inst_MIMG__IMAGE_GATHER4_C_L::Inst_MIMG__IMAGE_GATHER4_C_L(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_l")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_L

Inst_MIMG__IMAGE_GATHER4_C_L::~Inst_MIMG__IMAGE_GATHER4_C_L()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_L

// --- description from .arch file ---
// gather 4 single component elements (2x2) with user LOD and PCF.
void
Inst_MIMG__IMAGE_GATHER4_C_L::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_B class methods ---

Inst_MIMG__IMAGE_GATHER4_C_B::Inst_MIMG__IMAGE_GATHER4_C_B(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_b")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_B

Inst_MIMG__IMAGE_GATHER4_C_B::~Inst_MIMG__IMAGE_GATHER4_C_B()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_B

// --- description from .arch file ---
// gather 4 single component elements (2x2) with user bias and PCF.
void
Inst_MIMG__IMAGE_GATHER4_C_B::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_B_CL class methods ---

Inst_MIMG__IMAGE_GATHER4_C_B_CL::Inst_MIMG__IMAGE_GATHER4_C_B_CL(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_b_cl")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_B_CL

Inst_MIMG__IMAGE_GATHER4_C_B_CL::~Inst_MIMG__IMAGE_GATHER4_C_B_CL()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL

// --- description from .arch file ---
// gather 4 single component elements (2x2) with user bias, clamp and PCF.
void
Inst_MIMG__IMAGE_GATHER4_C_B_CL::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_LZ class methods ---

Inst_MIMG__IMAGE_GATHER4_C_LZ::Inst_MIMG__IMAGE_GATHER4_C_LZ(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_lz")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_LZ

Inst_MIMG__IMAGE_GATHER4_C_LZ::~Inst_MIMG__IMAGE_GATHER4_C_LZ()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_LZ

// --- description from .arch file ---
// gather 4 single component elements (2x2) at level 0, with PCF.
void
Inst_MIMG__IMAGE_GATHER4_C_LZ::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_O class methods ---

Inst_MIMG__IMAGE_GATHER4_O::Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_O

Inst_MIMG__IMAGE_GATHER4_O::~Inst_MIMG__IMAGE_GATHER4_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_O

// --- description from .arch file ---
// GATHER4, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_CL_O class methods ---

Inst_MIMG__IMAGE_GATHER4_CL_O::Inst_MIMG__IMAGE_GATHER4_CL_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_cl_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_CL_O

Inst_MIMG__IMAGE_GATHER4_CL_O::~Inst_MIMG__IMAGE_GATHER4_CL_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_CL_O

// --- description from .arch file ---
// GATHER4_CL, with user offsets.
// NOTE(review): GATHER4 *_O (user-offset) variants — decode-only stubs;
// execute() aborts via panicUnimplemented().
void
Inst_MIMG__IMAGE_GATHER4_CL_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_L_O class methods ---

Inst_MIMG__IMAGE_GATHER4_L_O::Inst_MIMG__IMAGE_GATHER4_L_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_l_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_L_O

Inst_MIMG__IMAGE_GATHER4_L_O::~Inst_MIMG__IMAGE_GATHER4_L_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_L_O

// --- description from .arch file ---
// GATHER4_L, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_L_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_B_O class methods ---

Inst_MIMG__IMAGE_GATHER4_B_O::Inst_MIMG__IMAGE_GATHER4_B_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_b_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_B_O

Inst_MIMG__IMAGE_GATHER4_B_O::~Inst_MIMG__IMAGE_GATHER4_B_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_B_O

// --- description from .arch file ---
// GATHER4_B, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_B_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_B_CL_O class methods ---

Inst_MIMG__IMAGE_GATHER4_B_CL_O::Inst_MIMG__IMAGE_GATHER4_B_CL_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_b_cl_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_B_CL_O

Inst_MIMG__IMAGE_GATHER4_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_B_CL_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_B_CL_O

// --- description from .arch file ---
// GATHER4_B_CL, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_LZ_O class methods ---

Inst_MIMG__IMAGE_GATHER4_LZ_O::Inst_MIMG__IMAGE_GATHER4_LZ_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_lz_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_LZ_O

Inst_MIMG__IMAGE_GATHER4_LZ_O::~Inst_MIMG__IMAGE_GATHER4_LZ_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_LZ_O

// --- description from .arch file ---
// GATHER4_LZ, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_LZ_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_O class methods ---

Inst_MIMG__IMAGE_GATHER4_C_O::Inst_MIMG__IMAGE_GATHER4_C_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_O

Inst_MIMG__IMAGE_GATHER4_C_O::~Inst_MIMG__IMAGE_GATHER4_C_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_O

// --- description from .arch file ---
// GATHER4_C, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_C_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_CL_O class methods ---

Inst_MIMG__IMAGE_GATHER4_C_CL_O::Inst_MIMG__IMAGE_GATHER4_C_CL_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_cl_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_CL_O

Inst_MIMG__IMAGE_GATHER4_C_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_CL_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_CL_O

// --- description from .arch file ---
// GATHER4_C_CL, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_C_CL_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_L_O class methods ---

Inst_MIMG__IMAGE_GATHER4_C_L_O::Inst_MIMG__IMAGE_GATHER4_C_L_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_l_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_L_O

Inst_MIMG__IMAGE_GATHER4_C_L_O::~Inst_MIMG__IMAGE_GATHER4_C_L_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_L_O

// --- description from .arch file ---
// GATHER4_C_L, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_C_L_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_B_O class methods ---

Inst_MIMG__IMAGE_GATHER4_C_B_O::Inst_MIMG__IMAGE_GATHER4_C_B_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_b_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_B_O

Inst_MIMG__IMAGE_GATHER4_C_B_O::~Inst_MIMG__IMAGE_GATHER4_C_B_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_B_O

// --- description from .arch file ---
// GATHER4_B, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_C_B_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_B_CL_O class methods ---

Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::Inst_MIMG__IMAGE_GATHER4_C_B_CL_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_b_cl_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O

Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O

// --- description from .arch file ---
// GATHER4_B_CL, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GATHER4_C_LZ_O class methods ---

Inst_MIMG__IMAGE_GATHER4_C_LZ_O::Inst_MIMG__IMAGE_GATHER4_C_LZ_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_gather4_c_lz_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GATHER4_C_LZ_O

Inst_MIMG__IMAGE_GATHER4_C_LZ_O::~Inst_MIMG__IMAGE_GATHER4_C_LZ_O()
{
} // ~Inst_MIMG__IMAGE_GATHER4_C_LZ_O

// --- description from .arch file ---
// GATHER4_C_LZ, with user offsets.
void
Inst_MIMG__IMAGE_GATHER4_C_LZ_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_GET_LOD class methods ---

Inst_MIMG__IMAGE_GET_LOD::Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_get_lod")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_GET_LOD

Inst_MIMG__IMAGE_GET_LOD::~Inst_MIMG__IMAGE_GET_LOD()
{
} // ~Inst_MIMG__IMAGE_GET_LOD

// --- description from .arch file ---
// Return calculated LOD. Vdata gets 2 32bit integer values: { rawLOD,
// --- clampedLOD }.
// NOTE(review): GET_LOD and the SAMPLE_CD (coarse-derivative) family below
// are decode-only stubs; execute() aborts via panicUnimplemented().
void
Inst_MIMG__IMAGE_GET_LOD::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_SAMPLE_CD class methods ---

Inst_MIMG__IMAGE_SAMPLE_CD::Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_sample_cd")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_SAMPLE_CD

Inst_MIMG__IMAGE_SAMPLE_CD::~Inst_MIMG__IMAGE_SAMPLE_CD()
{
} // ~Inst_MIMG__IMAGE_SAMPLE_CD

// --- description from .arch file ---
// sample texture map, with user derivatives (LOD per quad)
void
Inst_MIMG__IMAGE_SAMPLE_CD::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_SAMPLE_CD_CL class methods ---

Inst_MIMG__IMAGE_SAMPLE_CD_CL::Inst_MIMG__IMAGE_SAMPLE_CD_CL(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_sample_cd_cl")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_SAMPLE_CD_CL

Inst_MIMG__IMAGE_SAMPLE_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_CD_CL()
{
} // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL

// --- description from .arch file ---
// sample texture map, with LOD clamp specified in shader, with user
// --- derivatives (LOD per quad).
void
Inst_MIMG__IMAGE_SAMPLE_CD_CL::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_SAMPLE_C_CD class methods ---

Inst_MIMG__IMAGE_SAMPLE_C_CD::Inst_MIMG__IMAGE_SAMPLE_C_CD(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_sample_c_cd")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_SAMPLE_C_CD

Inst_MIMG__IMAGE_SAMPLE_C_CD::~Inst_MIMG__IMAGE_SAMPLE_C_CD()
{
} // ~Inst_MIMG__IMAGE_SAMPLE_C_CD

// --- description from .arch file ---
// SAMPLE_C, with user derivatives (LOD per quad).
void
Inst_MIMG__IMAGE_SAMPLE_C_CD::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL class methods ---

Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_sample_c_cd_cl")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL

Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL()
{
} // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL

// --- description from .arch file ---
// SAMPLE_C, with LOD clamp specified in shader, with user derivatives
// (LOD per quad).
void
Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_SAMPLE_CD_O class methods ---

Inst_MIMG__IMAGE_SAMPLE_CD_O::Inst_MIMG__IMAGE_SAMPLE_CD_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_sample_cd_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_SAMPLE_CD_O

Inst_MIMG__IMAGE_SAMPLE_CD_O::~Inst_MIMG__IMAGE_SAMPLE_CD_O()
{
} // ~Inst_MIMG__IMAGE_SAMPLE_CD_O

// --- description from .arch file ---
// SAMPLE_O, with user derivatives (LOD per quad).
void
Inst_MIMG__IMAGE_SAMPLE_CD_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_SAMPLE_CD_CL_O class methods ---

Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_CD_CL_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_sample_cd_cl_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O

Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O()
{
} // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O

// --- description from .arch file ---
// SAMPLE_O, with LOD clamp specified in shader, with user derivatives
// (LOD per quad).
void
Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_SAMPLE_C_CD_O class methods ---

Inst_MIMG__IMAGE_SAMPLE_C_CD_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_sample_c_cd_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_SAMPLE_C_CD_O

Inst_MIMG__IMAGE_SAMPLE_C_CD_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_O()
{
} // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_O

// --- description from .arch file ---
// SAMPLE_C_O, with user derivatives (LOD per quad).
void
Inst_MIMG__IMAGE_SAMPLE_C_CD_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O class methods ---

Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O(
    InFmt_MIMG *iFmt)
    : Inst_MIMG(iFmt, "image_sample_c_cd_cl_o")
{
    setFlag(GlobalSegment);
} // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O

Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O()
{
} // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O

// --- description from .arch file ---
// SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives
// (LOD per quad).
void
Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_EXP__EXP class methods ---

Inst_EXP__EXP::Inst_EXP__EXP(InFmt_EXP *iFmt)
    : Inst_EXP(iFmt, "exp")
{
} // Inst_EXP__EXP

Inst_EXP__EXP::~Inst_EXP__EXP()
{
} // ~Inst_EXP__EXP

// --- description from .arch file ---
// Export through SX.
- void - Inst_EXP__EXP::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_LOAD_UBYTE class methods --- - - Inst_FLAT__FLAT_LOAD_UBYTE::Inst_FLAT__FLAT_LOAD_UBYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_ubyte") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_UBYTE - - Inst_FLAT__FLAT_LOAD_UBYTE::~Inst_FLAT__FLAT_LOAD_UBYTE() - { - } // ~Inst_FLAT__FLAT_LOAD_UBYTE - - // --- description from .arch file --- - // Untyped buffer load unsigned byte (zero extend to VGPR destination). - void - Inst_FLAT__FLAT_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - vdst.write(); - } // execute - // --- Inst_FLAT__FLAT_LOAD_SBYTE class methods --- - - Inst_FLAT__FLAT_LOAD_SBYTE::Inst_FLAT__FLAT_LOAD_SBYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_sbyte") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_SBYTE - - Inst_FLAT__FLAT_LOAD_SBYTE::~Inst_FLAT__FLAT_LOAD_SBYTE() - { - } // ~Inst_FLAT__FLAT_LOAD_SBYTE - - // --- description from .arch file --- - // Untyped buffer load 
signed byte (sign extend to VGPR destination). - void - Inst_FLAT__FLAT_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_FLAT__FLAT_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_FLAT__FLAT_LOAD_USHORT class methods --- - - Inst_FLAT__FLAT_LOAD_USHORT::Inst_FLAT__FLAT_LOAD_USHORT(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_ushort") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_USHORT - - Inst_FLAT__FLAT_LOAD_USHORT::~Inst_FLAT__FLAT_LOAD_USHORT() - { - } // ~Inst_FLAT__FLAT_LOAD_USHORT - - // --- description from .arch file --- - // Untyped buffer load unsigned short (zero extend to VGPR destination). - void - Inst_FLAT__FLAT_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (VecElemU32)((reinterpret_cast( - gpuDynInst->d_data))[lane]); - } - } - vdst.write(); - } // execute - - // --- Inst_FLAT__FLAT_LOAD_SSHORT class methods --- - - Inst_FLAT__FLAT_LOAD_SSHORT::Inst_FLAT__FLAT_LOAD_SSHORT(InFmt_FLAT *iFmt) - : 
Inst_FLAT(iFmt, "flat_load_sshort") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_SSHORT - - Inst_FLAT__FLAT_LOAD_SSHORT::~Inst_FLAT__FLAT_LOAD_SSHORT() - { - } // ~Inst_FLAT__FLAT_LOAD_SSHORT - - // --- description from .arch file --- - // Untyped buffer load signed short (sign extend to VGPR destination). - void - Inst_FLAT__FLAT_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - - void - Inst_FLAT__FLAT_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_FLAT__FLAT_LOAD_DWORD class methods --- - - Inst_FLAT__FLAT_LOAD_DWORD::Inst_FLAT__FLAT_LOAD_DWORD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dword") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORD - - Inst_FLAT__FLAT_LOAD_DWORD::~Inst_FLAT__FLAT_LOAD_DWORD() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORD - - // --- description from .arch file --- - // Untyped buffer load dword. 
- void - Inst_FLAT__FLAT_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - vdst.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_LOAD_DWORDX2 class methods --- - - Inst_FLAT__FLAT_LOAD_DWORDX2::Inst_FLAT__FLAT_LOAD_DWORDX2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx2") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX2 - - Inst_FLAT__FLAT_LOAD_DWORDX2::~Inst_FLAT__FLAT_LOAD_DWORDX2() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer load 2 dwords. 
- void - Inst_FLAT__FLAT_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - vdst.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_LOAD_DWORDX3 class methods --- - - Inst_FLAT__FLAT_LOAD_DWORDX3::Inst_FLAT__FLAT_LOAD_DWORDX3( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx3") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX3 - - Inst_FLAT__FLAT_LOAD_DWORDX3::~Inst_FLAT__FLAT_LOAD_DWORDX3() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer load 3 dwords. 
- void - Inst_FLAT__FLAT_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<3>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_LOAD_DWORDX4 class methods --- - - Inst_FLAT__FLAT_LOAD_DWORDX4::Inst_FLAT__FLAT_LOAD_DWORDX4( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_load_dwordx4") - { - setFlag(MemoryRef); - setFlag(Load); - } // Inst_FLAT__FLAT_LOAD_DWORDX4 - - Inst_FLAT__FLAT_LOAD_DWORDX4::~Inst_FLAT__FLAT_LOAD_DWORDX4() - { - } // ~Inst_FLAT__FLAT_LOAD_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer load 4 dwords. 
- void - Inst_FLAT__FLAT_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemRead<4>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - VecOperandU32 vdst0(gpuDynInst, extData.VDST); - VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); - VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); - VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst0[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4]; - vdst1[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1]; - vdst2[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2]; - vdst3[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3]; - } - } - - vdst0.write(); - vdst1.write(); - vdst2.write(); - vdst3.write(); - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_BYTE class methods --- - - Inst_FLAT__FLAT_STORE_BYTE::Inst_FLAT__FLAT_STORE_BYTE(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_byte") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_BYTE - - Inst_FLAT__FLAT_STORE_BYTE::~Inst_FLAT__FLAT_STORE_BYTE() - { - } // ~Inst_FLAT__FLAT_STORE_BYTE - - // --- description from .arch file --- - // Untyped buffer store byte. 
- void - Inst_FLAT__FLAT_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU8 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // execute - // --- Inst_FLAT__FLAT_STORE_SHORT class methods --- - - Inst_FLAT__FLAT_STORE_SHORT::Inst_FLAT__FLAT_STORE_SHORT(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_short") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_SHORT - - Inst_FLAT__FLAT_STORE_SHORT::~Inst_FLAT__FLAT_STORE_SHORT() - { - } // ~Inst_FLAT__FLAT_STORE_SHORT - - // --- description from .arch file --- - // Untyped buffer store short. 
- void - Inst_FLAT__FLAT_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU16 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORD class methods --- - - Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dword") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORD - - Inst_FLAT__FLAT_STORE_DWORD::~Inst_FLAT__FLAT_STORE_DWORD() - { - } // ~Inst_FLAT__FLAT_STORE_DWORD - - // --- description from .arch file --- - // Untyped buffer store dword. 
- void - Inst_FLAT__FLAT_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORDX2 class methods --- - - Inst_FLAT__FLAT_STORE_DWORDX2::Inst_FLAT__FLAT_STORE_DWORDX2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx2") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX2 - - Inst_FLAT__FLAT_STORE_DWORDX2::~Inst_FLAT__FLAT_STORE_DWORDX2() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX2 - - // --- description from .arch file --- - // Untyped buffer store 2 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->d_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORDX3 class methods --- - - Inst_FLAT__FLAT_STORE_DWORDX3::Inst_FLAT__FLAT_STORE_DWORDX3( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx3") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX3 - - Inst_FLAT__FLAT_STORE_DWORDX3::~Inst_FLAT__FLAT_STORE_DWORDX3() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX3 - - // --- description from .arch file --- - // Untyped buffer store 3 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data0(gpuDynInst, extData.DATA); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); - - data0.read(); - data1.read(); - data2.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 3 + 2] = data2[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<3>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_STORE_DWORDX4 class methods --- - - Inst_FLAT__FLAT_STORE_DWORDX4::Inst_FLAT__FLAT_STORE_DWORDX4( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_store_dwordx4") - { - setFlag(MemoryRef); - setFlag(Store); - } // Inst_FLAT__FLAT_STORE_DWORDX4 - - Inst_FLAT__FLAT_STORE_DWORDX4::~Inst_FLAT__FLAT_STORE_DWORDX4() - { - } // ~Inst_FLAT__FLAT_STORE_DWORDX4 - - // --- description from .arch file --- - // Untyped buffer store 4 dwords. 
- void - Inst_FLAT__FLAT_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - wf->decExpInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data0(gpuDynInst, extData.DATA); - ConstVecOperandU32 data1(gpuDynInst, extData.DATA + 1); - ConstVecOperandU32 data2(gpuDynInst, extData.DATA + 2); - ConstVecOperandU32 data3(gpuDynInst, extData.DATA + 3); - - data0.read(); - data1.read(); - data2.read(); - data3.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4] = data0[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; - (reinterpret_cast( - gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initMemWrite<4>(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) - { - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SWAP class methods --- - - Inst_FLAT__FLAT_ATOMIC_SWAP::Inst_FLAT__FLAT_ATOMIC_SWAP(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_swap") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SWAP - - Inst_FLAT__FLAT_ATOMIC_SWAP::~Inst_FLAT__FLAT_ATOMIC_SWAP() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SWAP - - // --- description from .arch file --- - // 
32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP class methods --- - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP - ::Inst_FLAT__FLAT_ATOMIC_CMPSWAP(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_cmpswap") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP() - { - } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // src = DATA[0]; - // cmp = DATA[1]; - // MEM[ADDR] = (tmp == cmp) ? 
src : tmp; - // RETURN_DATA[0] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - ConstVecOperandU32 cmp(gpuDynInst, extData.DATA + 1); - - data.read(); - cmp.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->x_data))[lane] - = data[lane]; - (reinterpret_cast(gpuDynInst->a_data))[lane] - = cmp[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD::Inst_FLAT__FLAT_ATOMIC_ADD(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD - - Inst_FLAT__FLAT_ATOMIC_ADD::~Inst_FLAT__FLAT_ATOMIC_ADD() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += 
DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SUB class methods --- - - Inst_FLAT__FLAT_ATOMIC_SUB::Inst_FLAT__FLAT_ATOMIC_SUB(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_sub") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SUB - - Inst_FLAT__FLAT_ATOMIC_SUB::~Inst_FLAT__FLAT_ATOMIC_SUB() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SUB - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA; - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_SMIN class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMIN - - Inst_FLAT__FLAT_ATOMIC_SMIN::~Inst_FLAT__FLAT_ATOMIC_SMIN() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMIN::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMIN::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - // --- 
Inst_FLAT__FLAT_ATOMIC_UMIN class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMIN::Inst_FLAT__FLAT_ATOMIC_UMIN(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umin") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMIN - - Inst_FLAT__FLAT_ATOMIC_UMIN::~Inst_FLAT__FLAT_ATOMIC_UMIN() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMIN - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_SMAX class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMAX::Inst_FLAT__FLAT_ATOMIC_SMAX(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMAX - - Inst_FLAT__FLAT_ATOMIC_SMAX::~Inst_FLAT__FLAT_ATOMIC_SMAX() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare); - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_SMAX::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_SMAX::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_UMAX class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMAX::Inst_FLAT__FLAT_ATOMIC_UMAX(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umax") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMAX - - Inst_FLAT__FLAT_ATOMIC_UMAX::~Inst_FLAT__FLAT_ATOMIC_UMAX() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMAX - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare); - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_AND class methods --- - - Inst_FLAT__FLAT_ATOMIC_AND::Inst_FLAT__FLAT_ATOMIC_AND(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_and") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_AND - - Inst_FLAT__FLAT_ATOMIC_AND::~Inst_FLAT__FLAT_ATOMIC_AND() - { - } // ~Inst_FLAT__FLAT_ATOMIC_AND - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA; - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_OR class methods --- - - Inst_FLAT__FLAT_ATOMIC_OR::Inst_FLAT__FLAT_ATOMIC_OR(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_or") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_OR - - Inst_FLAT__FLAT_ATOMIC_OR::~Inst_FLAT__FLAT_ATOMIC_OR() - { - } // ~Inst_FLAT__FLAT_ATOMIC_OR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA; - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - wf->decLGKMInstsIssued(); - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU32 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_OR::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_OR::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU32 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - - // --- Inst_FLAT__FLAT_ATOMIC_XOR class methods --- - - Inst_FLAT__FLAT_ATOMIC_XOR::Inst_FLAT__FLAT_ATOMIC_XOR(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_xor") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_XOR - - Inst_FLAT__FLAT_ATOMIC_XOR::~Inst_FLAT__FLAT_ATOMIC_XOR() - { - } // ~Inst_FLAT__FLAT_ATOMIC_XOR - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA; - // RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_INC class methods --- - - Inst_FLAT__FLAT_ATOMIC_INC::Inst_FLAT__FLAT_ATOMIC_INC(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_inc") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_INC - - Inst_FLAT__FLAT_ATOMIC_INC::~Inst_FLAT__FLAT_ATOMIC_INC() - { - } // ~Inst_FLAT__FLAT_ATOMIC_INC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA = tmp. - void - Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_DEC class methods --- - - Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_dec") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_DEC - - Inst_FLAT__FLAT_ATOMIC_DEC::~Inst_FLAT__FLAT_ATOMIC_DEC() - { - } // ~Inst_FLAT__FLAT_ATOMIC_DEC - - // --- description from .arch file --- - // 32b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1 - // (unsigned compare); RETURN_DATA = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_SWAP_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_swap_x2") - { - setFlag(AtomicExch); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 - - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::~Inst_FLAT__FLAT_ATOMIC_SWAP_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_cmpswap_x2") - { - setFlag(AtomicCAS); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // src = DATA[0:1]; - // cmp = DATA[2:3]; - // MEM[ADDR] = (tmp == cmp) ? src : tmp; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - ConstVecOperandU64 cmp(gpuDynInst, extData.DATA + 2); - - data.read(); - cmp.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->x_data))[lane] - = data[lane]; - (reinterpret_cast(gpuDynInst->a_data))[lane] - = cmp[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_ADD_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_ADD_X2::Inst_FLAT__FLAT_ATOMIC_ADD_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_add_x2") - { - setFlag(AtomicAdd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_ADD_X2 - - Inst_FLAT__FLAT_ATOMIC_ADD_X2::~Inst_FLAT__FLAT_ATOMIC_ADD_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_ADD_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] += 
DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst) - { - Wavefront *wf = gpuDynInst->wavefront(); - - if (gpuDynInst->exec_mask.none()) { - wf->decVMemInstsIssued(); - if (isFlat()) { - wf->decLGKMInstsIssued(); - } - return; - } - - gpuDynInst->execUnitId = wf->execUnitId; - gpuDynInst->latency.init(gpuDynInst->computeUnit()); - gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); - - ConstVecOperandU64 data(gpuDynInst, extData.DATA); - - data.read(); - - calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - (reinterpret_cast(gpuDynInst->a_data))[lane] - = data[lane]; - } - } - - issueRequestHelper(gpuDynInst); - } // execute - - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::initiateAcc(GPUDynInstPtr gpuDynInst) - { - initAtomicAccess(gpuDynInst); - } // initiateAcc - - void - Inst_FLAT__FLAT_ATOMIC_ADD_X2::completeAcc(GPUDynInstPtr gpuDynInst) - { - if (isAtomicRet()) { - VecOperandU64 vdst(gpuDynInst, extData.VDST); - - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (gpuDynInst->exec_mask[lane]) { - vdst[lane] = (reinterpret_cast( - gpuDynInst->d_data))[lane]; - } - } - - vdst.write(); - } - } // completeAcc - // --- Inst_FLAT__FLAT_ATOMIC_SUB_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SUB_X2::Inst_FLAT__FLAT_ATOMIC_SUB_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_sub_x2") - { - setFlag(AtomicSub); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SUB_X2 - - Inst_FLAT__FLAT_ATOMIC_SUB_X2::~Inst_FLAT__FLAT_ATOMIC_SUB_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SUB_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_SMIN_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMIN_X2 - - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::~Inst_FLAT__FLAT_ATOMIC_SMIN_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_UMIN_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::Inst_FLAT__FLAT_ATOMIC_UMIN_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umin_x2") - { - setFlag(AtomicMin); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMIN_X2 - - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::~Inst_FLAT__FLAT_ATOMIC_UMIN_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMIN_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_SMAX_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::Inst_FLAT__FLAT_ATOMIC_SMAX_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_smax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_SMAX_X2 - - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::~Inst_FLAT__FLAT_ATOMIC_SMAX_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_SMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_UMAX_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::Inst_FLAT__FLAT_ATOMIC_UMAX_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_umax_x2") - { - setFlag(AtomicMax); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_UMAX_X2 - - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::~Inst_FLAT__FLAT_ATOMIC_UMAX_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_UMAX_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_AND_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_AND_X2::Inst_FLAT__FLAT_ATOMIC_AND_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_and_x2") - { - setFlag(AtomicAnd); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_AND_X2 - - Inst_FLAT__FLAT_ATOMIC_AND_X2::~Inst_FLAT__FLAT_ATOMIC_AND_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_AND_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] &= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_OR_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_OR_X2::Inst_FLAT__FLAT_ATOMIC_OR_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_or_x2") - { - setFlag(AtomicOr); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_OR_X2 - - Inst_FLAT__FLAT_ATOMIC_OR_X2::~Inst_FLAT__FLAT_ATOMIC_OR_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_OR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] |= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_XOR_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_XOR_X2::Inst_FLAT__FLAT_ATOMIC_XOR_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_xor_x2") - { - setFlag(AtomicXor); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_XOR_X2 - - Inst_FLAT__FLAT_ATOMIC_XOR_X2::~Inst_FLAT__FLAT_ATOMIC_XOR_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_XOR_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] ^= DATA[0:1]; - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_INC_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_INC_X2::Inst_FLAT__FLAT_ATOMIC_INC_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_inc_x2") - { - setFlag(AtomicInc); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_INC_X2 - - Inst_FLAT__FLAT_ATOMIC_INC_X2::~Inst_FLAT__FLAT_ATOMIC_INC_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_INC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); - // RETURN_DATA[0:1] = tmp. 
- void - Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute - // --- Inst_FLAT__FLAT_ATOMIC_DEC_X2 class methods --- - - Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2( - InFmt_FLAT *iFmt) - : Inst_FLAT(iFmt, "flat_atomic_dec_x2") - { - setFlag(AtomicDec); - if (instData.GLC) { - setFlag(AtomicReturn); - } else { - setFlag(AtomicNoReturn); - } - setFlag(MemoryRef); - } // Inst_FLAT__FLAT_ATOMIC_DEC_X2 - - Inst_FLAT__FLAT_ATOMIC_DEC_X2::~Inst_FLAT__FLAT_ATOMIC_DEC_X2() - { - } // ~Inst_FLAT__FLAT_ATOMIC_DEC_X2 - - // --- description from .arch file --- - // 64b: - // tmp = MEM[ADDR]; - // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 - // (unsigned compare); - // RETURN_DATA[0:1] = tmp. - void - Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) - { - panicUnimplemented(); - } // execute -} // namespace VegaISA -} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index ca349c365f..a979c1e492 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -32,6 +32,10 @@ #ifndef __ARCH_VEGA_INSTS_INSTRUCTIONS_HH__ #define __ARCH_VEGA_INSTS_INSTRUCTIONS_HH__ +#include +#include + +#include "arch/amdgpu/common/dtype/mxfp_types.hh" #include "arch/amdgpu/vega/gpu_decoder.hh" #include "arch/amdgpu/vega/insts/gpu_static_inst.hh" #include "arch/amdgpu/vega/insts/op_encodings.hh" @@ -8098,6 +8102,74 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP2__V_SUBREV_U32 + class Inst_VOP2__V_FMAC_F32 : public Inst_VOP2 + { + public: + Inst_VOP2__V_FMAC_F32(InFmt_VOP2*); + ~Inst_VOP2__V_FMAC_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) 
override + { + switch (opIdx) { + case 0: //src_0 + return 4; + case 1: //src_1 + return 4; + case 2: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP2__V_FMAC_F32 + + class Inst_VOP2__V_XNOR_B32 : public Inst_VOP2 + { + public: + Inst_VOP2__V_XNOR_B32(InFmt_VOP2*); + ~Inst_VOP2__V_XNOR_B32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 4; + case 1: //src_1 + return 4; + case 2: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP2__V_XNOR_B32 + class Inst_VOP1__V_NOP : public Inst_VOP1 { public: @@ -9818,6 +9890,38 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP1__V_CLREXCP + class Inst_VOP1__V_MOV_B64 : public Inst_VOP1 + { + public: + Inst_VOP1__V_MOV_B64(InFmt_VOP1*); + ~Inst_VOP1__V_MOV_B64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src + return 8; + case 1: //vdst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP1__V_MOV_B64 + class Inst_VOP1__V_CVT_F16_U16 : public Inst_VOP1 { public: @@ -10458,6 +10562,38 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP1__V_LOG_LEGACY_F32 + class Inst_VOP1__V_ACCVGPR_MOV_B32 : public Inst_VOP1 + { + public: + 
Inst_VOP1__V_ACCVGPR_MOV_B32(InFmt_VOP1*); + ~Inst_VOP1__V_ACCVGPR_MOV_B32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src + return 4; + case 1: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP1__V_ACCVGPR_MOV_B32 + class Inst_VOPC__V_CMP_CLASS_F32 : public Inst_VOPC { public: @@ -25814,6 +25950,40 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP3__V_SUBREV_U32 + class Inst_VOP3__V_FMAC_F32 : public Inst_VOP3A + { + public: + Inst_VOP3__V_FMAC_F32(InFmt_VOP3A*); + ~Inst_VOP3__V_FMAC_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 4; + case 1: //src_1 + return 4; + case 2: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_FMAC_F32 + class Inst_VOP3__V_NOP : public Inst_VOP3A { public: @@ -30124,6 +30294,42 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP3__V_DIV_FIXUP_F16 + class Inst_VOP3__V_LSHL_ADD_U64 : public Inst_VOP3A + { + public: + Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A*); + ~Inst_VOP3__V_LSHL_ADD_U64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) 
override + { + switch (opIdx) { + case 0: //src_0 + return 8; + case 1: //src_1 + return 4; + case 2: //src_2 + return 8; + case 3: //vdst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_LSHL_ADD_U64 + class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3A { public: @@ -32986,6 +33192,74 @@ namespace VegaISA void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_READ_U16 + class Inst_DS__DS_READ_U16_D16 : public Inst_DS + { + public: + Inst_DS__DS_READ_U16_D16(InFmt_DS*); + ~Inst_DS__DS_READ_U16_D16(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 4; + case 1: //vgpr_rtn + return 2; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_DS__DS_READ_U16_D16 + + class Inst_DS__DS_READ_U16_D16_HI : public Inst_DS + { + public: + Inst_DS__DS_READ_U16_D16_HI(InFmt_DS*); + ~Inst_DS__DS_READ_U16_D16_HI(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 4; + case 1: //vgpr_rtn + return 2; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_DS__DS_READ_U16_D16_HI + 
class Inst_DS__DS_SWIZZLE_B32 : public Inst_DS { public: @@ -36670,6 +36944,82 @@ namespace VegaISA void completeAcc(GPUDynInstPtr) override; }; // Inst_MUBUF__BUFFER_LOAD_SSHORT + class Inst_MUBUF__BUFFER_LOAD_SHORT_D16 : public Inst_MUBUF + { + public: + Inst_MUBUF__BUFFER_LOAD_SHORT_D16(InFmt_MUBUF*); + ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 8; + case 1: //sgpr_r + return 16; + case 2: //sgpr_o + return 4; + case 3: //vgpr_d + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + + class Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI : public Inst_MUBUF + { + public: + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(InFmt_MUBUF*); + ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 8; + case 1: //sgpr_r + return 16; + case 2: //sgpr_o + return 4; + case 3: //vgpr_d + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + class Inst_MUBUF__BUFFER_LOAD_DWORD : public Inst_MUBUF { public: @@ -42280,6 +42630,43 @@ namespace VegaISA void 
completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_STORE_SHORT + class Inst_FLAT__FLAT_STORE_SHORT_D16_HI : public Inst_FLAT + { + public: + Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT*); + ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 0; } + int numSrcRegOperands() override { return isFlat() ? 2 : 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_addr + return vgprIsOffset() ? 4 : 8; + case 1: //vgpr_src + return 2; + case 2: //saddr + assert(!isFlat()); + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_FLAT__FLAT_STORE_SHORT_D16_HI + class Inst_FLAT__FLAT_STORE_DWORD : public Inst_FLAT { public: @@ -42580,6 +42967,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_SUB class Inst_FLAT__FLAT_ATOMIC_SMIN : public Inst_FLAT @@ -42656,6 +43045,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_UMIN class Inst_FLAT__FLAT_ATOMIC_SMAX : public Inst_FLAT @@ -42732,6 +43123,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_UMAX class Inst_FLAT__FLAT_ATOMIC_AND : public Inst_FLAT @@ -42769,6 +43162,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void 
completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_AND class Inst_FLAT__FLAT_ATOMIC_OR : public Inst_FLAT @@ -42845,6 +43240,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_XOR class Inst_FLAT__FLAT_ATOMIC_INC : public Inst_FLAT @@ -42882,6 +43279,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_INC class Inst_FLAT__FLAT_ATOMIC_DEC : public Inst_FLAT @@ -42919,6 +43318,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_DEC class Inst_FLAT__FLAT_ATOMIC_SWAP_X2 : public Inst_FLAT @@ -42956,6 +43357,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_SWAP_X2 class Inst_FLAT__FLAT_ATOMIC_CMPSWAP_X2 : public Inst_FLAT @@ -43071,6 +43474,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_SUB_X2 class Inst_FLAT__FLAT_ATOMIC_SMIN_X2 : public Inst_FLAT @@ -43108,6 +43513,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_SMIN_X2 class Inst_FLAT__FLAT_ATOMIC_UMIN_X2 : public Inst_FLAT @@ -43145,6 +43552,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // 
Inst_FLAT__FLAT_ATOMIC_UMIN_X2 class Inst_FLAT__FLAT_ATOMIC_SMAX_X2 : public Inst_FLAT @@ -43182,6 +43591,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_SMAX_X2 class Inst_FLAT__FLAT_ATOMIC_UMAX_X2 : public Inst_FLAT @@ -43219,6 +43630,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_UMAX_X2 class Inst_FLAT__FLAT_ATOMIC_AND_X2 : public Inst_FLAT @@ -43256,6 +43669,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_AND_X2 class Inst_FLAT__FLAT_ATOMIC_OR_X2 : public Inst_FLAT @@ -43293,6 +43708,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_OR_X2 class Inst_FLAT__FLAT_ATOMIC_XOR_X2 : public Inst_FLAT @@ -43330,6 +43747,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_XOR_X2 class Inst_FLAT__FLAT_ATOMIC_INC_X2 : public Inst_FLAT @@ -43367,6 +43786,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_INC_X2 class Inst_FLAT__FLAT_ATOMIC_DEC_X2 : public Inst_FLAT @@ -43404,7 +43825,1068 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_DEC_X2 + + class 
Inst_FLAT__FLAT_ATOMIC_ADD_F32 : public Inst_FLAT + { + public: + Inst_FLAT__FLAT_ATOMIC_ADD_F32(InFmt_FLAT*); + ~Inst_FLAT__FLAT_ATOMIC_ADD_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return isFlat() ? 2 : 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_addr + return vgprIsOffset() ? 4 : 8; + case 1: //vgpr_src + return 4; + case 2: //vgpr_dst or saddr + return isFlat() ? 4 : 8; + case 3: //vgpr_dst + assert(!isFlat()); + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_FLAT__FLAT_ATOMIC_ADD_F32 + + class Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 : public Inst_FLAT + { + public: + Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16(InFmt_FLAT*); + ~Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return isFlat() ? 2 : 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_addr + return vgprIsOffset() ? 4 : 8; + case 1: //vgpr_src + return 4; + case 2: //vgpr_dst or saddr + return isFlat() ? 
4 : 8; + case 3: //vgpr_dst + assert(!isFlat()); + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_FLAT__FLAT_ATOMIC_PK_ADD_F16 + + class Inst_FLAT__FLAT_ATOMIC_ADD_F64 : public Inst_FLAT + { + public: + Inst_FLAT__FLAT_ATOMIC_ADD_F64(InFmt_FLAT*); + ~Inst_FLAT__FLAT_ATOMIC_ADD_F64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return isFlat() ? 2 : 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_addr + return vgprIsOffset() ? 4 : 8; + case 1: //vgpr_src + return 8; + case 2: //vgpr_dst or saddr + return isFlat() ? 8 : 8; + case 3: //vgpr_dst + assert(!isFlat()); + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_FLAT__FLAT_ATOMIC_ADD_F64 + + class Inst_FLAT__FLAT_ATOMIC_MIN_F64 : public Inst_FLAT + { + public: + Inst_FLAT__FLAT_ATOMIC_MIN_F64(InFmt_FLAT*); + ~Inst_FLAT__FLAT_ATOMIC_MIN_F64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return isFlat() ? 2 : 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_addr + return vgprIsOffset() ? 4 : 8; + case 1: //vgpr_src + return 8; + case 2: //vgpr_dst or saddr + return isFlat() ? 
8 : 8; + case 3: //vgpr_dst + assert(!isFlat()); + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_FLAT__FLAT_ATOMIC_MIN_F64 + + class Inst_FLAT__FLAT_ATOMIC_MAX_F64 : public Inst_FLAT + { + public: + Inst_FLAT__FLAT_ATOMIC_MAX_F64(InFmt_FLAT*); + ~Inst_FLAT__FLAT_ATOMIC_MAX_F64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return isFlat() ? 2 : 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_addr + return vgprIsOffset() ? 4 : 8; + case 1: //vgpr_src + return 8; + case 2: //vgpr_dst or saddr + return isFlat() ? 8 : 8; + case 3: //vgpr_dst + assert(!isFlat()); + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_FLAT__FLAT_ATOMIC_MAX_F64 + + class Inst_VOP3P__V_PK_FMA_F32 : public Inst_VOP3P + { + public: + Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P*); + ~Inst_VOP3P__V_PK_FMA_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 8; + case 1: // src1 + return 8; + case 2: // src2 + return 8; + case 3: // dst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3P__V_PK_FMA_F32 + + class Inst_VOP3P__V_PK_MUL_F32 : public 
Inst_VOP3P + { + public: + Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P*); + ~Inst_VOP3P__V_PK_MUL_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 8; + case 1: // src1 + return 8; + case 2: // dst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3P__V_PK_MUL_F32 + + class Inst_VOP3P__V_PK_ADD_F32 : public Inst_VOP3P + { + public: + Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P*); + ~Inst_VOP3P__V_PK_ADD_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 8; + case 1: // src1 + return 8; + case 2: // dst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3P__V_PK_ADD_F32 + + class Inst_VOP3P__V_PK_MOV_B32 : public Inst_VOP3P + { + public: + Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P*); + ~Inst_VOP3P__V_PK_MOV_B32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 8; + case 1: // src1 + return 8; + case 2: // dst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // 
Inst_VOP3P__V_PK_MOV_B32 + + template + class Inst_VOP3P_MAI__V_MFMA : public Inst_VOP3P_MAI + { + + private: + static constexpr int gprs_a = M * K * B / 64, gprs_b = K * N * B / 64, + gprs_c_d = M * N * B / 64; + + public: + Inst_VOP3P_MAI__V_MFMA(InFmt_VOP3P_MAI *iFmt) + : Inst_VOP3P_MAI(iFmt, *MNEMONIC) + { + setFlag(ALU); + setFlag(MFMA); + if (_delta == 2) { + setFlag(F64); + } else if (_delta == 1) { + setFlag(F32); + } + } + ~Inst_VOP3P_MAI__V_MFMA() {} + + int getNumOperands() override { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int getOperandSize(int opIdx) override { + switch (opIdx) { + case 0: // src0 "A" + return 4*gprs_a; + case 1: // src1 "B" + return 4*gprs_b; + case 2: // src2 "C" + return 4*gprs_c_d; + case 3: // dst + return 4*gprs_c_d; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void + execute(GPUDynInstPtr gpuDynInst) override + { + int acc_cd_off = 0; + int acc_a_off = 0; + int acc_b_off = 0; + if (instData.ACC_CD) { + acc_cd_off = gpuDynInst->wavefront()->accumOffset; + } + if (extData.ACC) { + int tmp_acc = extData.ACC; + if (tmp_acc & 0x1) { + acc_a_off = gpuDynInst->wavefront()->accumOffset; + } + if (tmp_acc & 0x2) { + acc_b_off = gpuDynInst->wavefront()->accumOffset; + } + } + + alignas(T1) std::byte _src0[sizeof(T1) * gprs_a]; + alignas(T1) std::byte _src1[sizeof(T1) * gprs_b]; + alignas(T1) std::byte _src2[sizeof(T1) * gprs_c_d]; + alignas(T2) std::byte _vdst[sizeof(T2) * gprs_c_d]; + T1 *src0 = std::launder(reinterpret_cast(&_src0)); + T1 *src1 = std::launder(reinterpret_cast(&_src1)); + T1 *src2 = std::launder(reinterpret_cast(&_src2)); + T2 *vdst = std::launder(reinterpret_cast(&_vdst)); + + // Handling of src2 is a bit tricky. The operator[] overload cannot + // be used for dword count > 2, and the dword count here is 4. 
Usually + // src2 is a VGPR/AccGPR, but it might also be constant. In order to + // use operator[] and handle constants, check for VGPR here and set + // a delta for each of the src2 GPRs. + int delta = isVectorReg(extData.SRC0) ? _delta : 0; + for (int i = 0; i < gprs_a; i++) { + new (&src0[i]) T1(gpuDynInst, extData.SRC0+acc_a_off+i*delta); + src0[i].readSrc(); + } + + delta = isVectorReg(extData.SRC1) ? _delta : 0; + for (int i = 0; i < gprs_b; i++) { + new (&src1[i]) T1(gpuDynInst, extData.SRC1+acc_b_off+i*delta); + src1[i].readSrc(); + } + + delta = isVectorReg(extData.SRC2) ? _delta : 0; + for (int i = 0; i < gprs_c_d; i++) { + new (&src2[i]) T1(gpuDynInst, extData.SRC2+acc_cd_off+i*delta); + src2[i].readSrc(); + } + + for (int i = 0; i < gprs_c_d; i++) { + new (&vdst[i]) T2(gpuDynInst, instData.VDST+acc_cd_off+i*_delta); + } + + // These values and meanings are described in the MI300 ISA manual: + // + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + // + // in section 7.1.4.2. In theory, only the M, N, K, and H values change + // for each MFMA instruction. + + // Output layout + constexpr int H = _delta == 2 ? 1 : 4; + + // This replaces `constexpr int B_I = std::ceil(64.0f / (N * M / H));` + // which failed clang compiler tests as it's not a constant expression. + constexpr float B_I_f = 64.0f / (N * M / H); + constexpr int B_I = + (static_cast(static_cast(B_I_f)) == B_I_f) + ? static_cast(B_I_f) + : static_cast(B_I_f) + ((B_I_f > 0) ? 1 : 0); + + constexpr int M_I = (64 / B_I) / N; + constexpr int G = M / (H * M_I); + + float result[M][N]; + + // Input layout + constexpr int K_L = K / (64 / (M * B)); + + for (int block = 0; block < B; block++) { + // Load src2 into result. 
src2 is row major + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + int item = (i % H) + H * (i/(H*M_I) + G * (block / B_I)); + int lane = j + N * ((i / H) % M_I + M_I * (block % B_I)); + + result[i][j] = src2[item][lane]; + } + } + + // Compute new result + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < K; ++k) { + // src0 is column major, src1 is row major + int lane_A = i + M * (block + B * (k / K_L)); + int lane_B = j + N * (block + B * (k / K_L)); + int item = k % K_L; + result[i][j] += + src0[item][lane_A] * src1[item][lane_B]; + } + } + } + + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + int item = (i % H) + H * (i/(H*M_I) + G * (block / B_I)); + int lane = j + N * ((i / H) % M_I + M_I * (block % B_I)); + + vdst[item][lane] = result[i][j]; + } + } + } + + for (int i = 0; i < gprs_c_d; ++i) { + vdst[i].write(); + } + + for (int i = 0; i < gprs_a; i++) { + std::destroy_at(&src0[i]); + } + for (int i = 0; i < gprs_b; i++) { + std::destroy_at(&src1[i]); + } + for (int i = 0; i < gprs_c_d; i++) { + std::destroy_at(&src2[i]); + } + for (int i = 0; i < gprs_c_d; i++) { + std::destroy_at(&vdst[i]); + } + } // execute + }; + + static const char *MNEM__V_MFMA_F32_4X4X1_16B_F32 = + "v_mfma_f32_4x4x1_16b_f32"; + using Inst_VOP3P_MAI__V_MFMA_F32_4X4X1_16B_F32 = + Inst_VOP3P_MAI__V_MFMA<1, 4, 4, 1, 16, ConstVecOperandF32, + VecOperandF32, &MNEM__V_MFMA_F32_4X4X1_16B_F32>; + + static const char *MNEM__V_MFMA_F32_32X32X1_2B_F32 = + "v_mfma_f32_32x32x1_2b_f32"; + using Inst_VOP3P_MAI__V_MFMA_F32_32X32X1_2B_F32 = + Inst_VOP3P_MAI__V_MFMA<1, 32, 32, 1, 2, ConstVecOperandF32, + VecOperandF32, + &MNEM__V_MFMA_F32_32X32X1_2B_F32>; + + static const char *MNEM__V_MFMA_F32_32X32X2_F32 = + "v_mfma_f32_32x32x2_f32"; + using Inst_VOP3P_MAI__V_MFMA_F32_32X32X2_F32 = + Inst_VOP3P_MAI__V_MFMA<1, 32, 32, 2, 1, ConstVecOperandF32, + VecOperandF32, &MNEM__V_MFMA_F32_32X32X2_F32>; + + static const char 
*MNEM__V_MFMA_F32_16X16X4_F32 = + "v_mfma_f32_16x16x4_f32"; + using Inst_VOP3P_MAI__V_MFMA_F32_16X16X4_F32 = + Inst_VOP3P_MAI__V_MFMA<1, 16, 16, 4, 1, ConstVecOperandF32, + VecOperandF32, &MNEM__V_MFMA_F32_16X16X4_F32>; + + static const char *MNEM__V_MFMA_F32_16X16X1_4B_F32 = + "v_mfma_f32_16x16x1_4b_f32"; + using Inst_VOP3P_MAI__V_MFMA_F32_16X16X1_4B_F32 = + Inst_VOP3P_MAI__V_MFMA<1, 16, 16, 1, 4, ConstVecOperandF32, + VecOperandF32, + &MNEM__V_MFMA_F32_16X16X1_4B_F32>; + + static const char *MNEM__V_MFMA_F64_4X4X4_4B_F64 = + "v_mfma_f64_4x4x4_4b_f64"; + using Inst_VOP3P_MAI__V_MFMA_F64_4X4X4_4B_F64 = + Inst_VOP3P_MAI__V_MFMA<2, 4, 4, 4, 4, ConstVecOperandF64, + VecOperandF64, &MNEM__V_MFMA_F64_4X4X4_4B_F64>; + + static const char *MNEM__V_MFMA_F64_16X16X4_F64 = + "v_mfma_f64_16x16x4_f64"; + using Inst_VOP3P_MAI__V_MFMA_F64_16X16X4_F64 = + Inst_VOP3P_MAI__V_MFMA<2, 16, 16, 4, 1, ConstVecOperandF64, + VecOperandF64, &MNEM__V_MFMA_F64_16X16X4_F64>; + + + template + class Inst_VOP3P_MAI__V_MFMA_MXFP : public Inst_VOP3P_MAI + { + + private: + // Scale GPRs needed by elements / GPR (gpr_ratio) + static constexpr int gpr_ratio = 32 / MXFPT::size(); + static constexpr int gprs_a = M * K * B / (64 * gpr_ratio); + static constexpr int gprs_b = K * N * B / (64 * gpr_ratio); + + // Always F32 which has an effective gpr_ratio of 1 + static constexpr int gprs_c_d = M * N * B / 64; + + public: + Inst_VOP3P_MAI__V_MFMA_MXFP(InFmt_VOP3P_MAI *iFmt) + : Inst_VOP3P_MAI(iFmt, *MNEMONIC) + { + setFlag(ALU); + setFlag(MFMA); + if (MXFPT::size() == 16) { + setFlag(F16); + } + } + ~Inst_VOP3P_MAI__V_MFMA_MXFP() {} + + int getNumOperands() override { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int getOperandSize(int opIdx) override { + switch (opIdx) { + case 0: // src0 "A" + return 4*gprs_a; + case 1: // src1 "B" + return 4*gprs_b; + case 2: // src2 "C" + 
return 4*gprs_c_d; + case 3: // dst + return 4*gprs_c_d; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void + execute(GPUDynInstPtr gpuDynInst) override + { + int acc_cd_off = 0; + int acc_a_off = 0; + int acc_b_off = 0; + if (instData.ACC_CD) { + acc_cd_off = gpuDynInst->wavefront()->accumOffset; + } + if (extData.ACC) { + int tmp_acc = extData.ACC; + if (tmp_acc & 0x1) { + acc_a_off = gpuDynInst->wavefront()->accumOffset; + } + if (tmp_acc & 0x2) { + acc_b_off = gpuDynInst->wavefront()->accumOffset; + } + } + + // Read the MXFP types as U32 - Consider this "untyped." + // A ConstVecOperand needs to be used for src2 as it could be an + // inline constant. The Const version provides an operator[] overload + // to read inline constants to each lane. The non-const type of src2 + // should be used for vdst to make it writeable. + using T1 = ConstVecOperandU32; + using T2 = ConstVecOperandF32; + using T3 = VecOperandF32; + + alignas(T1) std::byte _src0[sizeof(T1) * gprs_a]; + alignas(T1) std::byte _src1[sizeof(T1) * gprs_b]; + alignas(T2) std::byte _src2[sizeof(T2) * gprs_c_d]; + alignas(T3) std::byte _vdst[sizeof(T3) * gprs_c_d]; + T1 *src0 = std::launder(reinterpret_cast(&_src0)); + T1 *src1 = std::launder(reinterpret_cast(&_src1)); + T2 *src2 = std::launder(reinterpret_cast(&_src2)); + T3 *vdst = std::launder(reinterpret_cast(&_vdst)); + + // Handling of src2 is a bit tricky. The operator[] overload cannot + // be used for dword count > 2, and the dword count here is 4. Usually + // src2 is a VGPR/AccGPR, but it might also be constant. In order to + // use operator[] and handle constants, check for VGPR here and set + // a delta for each of the src2 GPRs. + + int delta = isVectorReg(extData.SRC0) ? 1 : 0; + for (int i = 0; i < gprs_a; i++) { + new (&src0[i]) T1(gpuDynInst, extData.SRC0+acc_a_off+i*delta); + src0[i].readSrc(); + } + + delta = isVectorReg(extData.SRC1) ? 
1 : 0; + for (int i = 0; i < gprs_b; i++) { + new (&src1[i]) T1(gpuDynInst, extData.SRC1+acc_b_off+i*delta); + src1[i].readSrc(); + } + + delta = isVectorReg(extData.SRC2) ? 1 : 0; + for (int i = 0; i < gprs_c_d; i++) { + new (&src2[i]) T2(gpuDynInst, extData.SRC2+acc_cd_off+i*delta); + src2[i].readSrc(); + } + + for (int i = 0; i < gprs_c_d; i++) { + new (&vdst[i]) T3(gpuDynInst, instData.VDST+acc_cd_off+i); + } + + // These values and meanings are described in the MI300 ISA manual: + // + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + // + // in section 7.1.4.2. In theory, only the M, N, K, and H values change + // for each MFMA instruction. + + // Output layout + constexpr int H = 4; + + // This replaces `constexpr int B_I = std::ceil(64.0f / (N * M / H));` + // which failed clang compiler tests as it's not a constant expression. + constexpr float B_I_f = 64.0f / (N * M / H); + constexpr int B_I = + (static_cast(static_cast(B_I_f)) == B_I_f) + ? static_cast(B_I_f) + : static_cast(B_I_f) + ((B_I_f > 0) ? 1 : 0); + constexpr int M_I = (64 / B_I) / N; + constexpr int G = M / (H * M_I); + + float result[M][N]; + + // Input layout + constexpr int K_L = K / (64 / (M * B)); + + for (int block = 0; block < B; block++) { + // Load src2 into result. 
src2 is row major + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + int item = (i % H) + H * (i/(H*M_I) + G * (block / B_I)); + int lane = j + N * ((i / H) % M_I + M_I * (block % B_I)); + + result[i][j] = src2[item][lane]; + } + } + + // Compute new result + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < K; ++k) { + // src0 is column major, src1 is row major + int lane_A = i + M * (block + B * (k / K_L)); + int lane_B = j + N * (block + B * (k / K_L)); + int item = k % K_L; + + PackedReg A_elems; + PackedReg B_elems; + + for (int i = 0; i < gprs_a; ++i) { + A_elems.setDword(i, src0[i][lane_A]); + } + for (int i = 0; i < gprs_b; ++i) { + B_elems.setDword(i, src1[i][lane_B]); + } + + MXFPT item_A(A_elems.getElem(item)); + MXFPT item_B(B_elems.getElem(item)); + + result[i][j] += item_A * item_B; + } + } + } + + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + int item = (i % H) + H * (i/(H*M_I) + G * (block / B_I)); + int lane = j + N * ((i / H) % M_I + M_I * (block % B_I)); + + vdst[item][lane] = result[i][j]; + } + } + } + + for (int i = 0; i < gprs_c_d; ++i) { + vdst[i].write(); + } + + for (int i = 0; i < gprs_a; i++) { + std::destroy_at(&src0[i]); + } + for (int i = 0; i < gprs_b; i++) { + std::destroy_at(&src1[i]); + } + for (int i = 0; i < gprs_c_d; i++) { + std::destroy_at(&src2[i]); + } + for (int i = 0; i < gprs_c_d; i++) { + std::destroy_at(&vdst[i]); + } + } // execute + }; + + + static const char *MNEM__V_MFMA_F32_16X16X16_F16 = + "v_mfma_f32_16x16x16_f16"; + using Inst_VOP3P_MAI__V_MFMA_F32_16X16X16_F16 = + Inst_VOP3P_MAI__V_MFMA_MXFP<16, 16, 16, 1, AMDGPU::mxfloat16, + &MNEM__V_MFMA_F32_16X16X16_F16>; + + static const char *MNEM__V_MFMA_F32_16X16X4_4B_F16 = + "v_mfma_f32_16x16x4_4b_f16"; + using Inst_VOP3P_MAI__V_MFMA_F32_16X16X4_4B_F16 = + Inst_VOP3P_MAI__V_MFMA_MXFP<16, 16, 4, 4, AMDGPU::mxfloat16, + &MNEM__V_MFMA_F32_16X16X4_4B_F16>; + + static const char 
*MNEM__V_MFMA_F32_32X32X4_2B_F16 = + "v_mfma_f32_32x32x4_2b_f16"; + using Inst_VOP3P_MAI__V_MFMA_F32_32X32X4_2B_F16 = + Inst_VOP3P_MAI__V_MFMA_MXFP<32, 32, 4, 2, AMDGPU::mxfloat16, + &MNEM__V_MFMA_F32_32X32X4_2B_F16>; + + static const char *NMEM__V_MFMA_F32_32X32X8_F16 = + "v_mfma_f32_32x32x8_f16"; + using Inst_VOP3P_MAI__V_MFMA_F32_32X32X8_F16 = + Inst_VOP3P_MAI__V_MFMA_MXFP<32, 32, 8, 1, AMDGPU::mxfloat16, + &NMEM__V_MFMA_F32_32X32X8_F16>; + + static const char *MNEM__V_MFMA_F32_4X4X4_16B_F16 = + "v_mfma_f32_4x4x4_16b_f16"; + using Inst_VOP3P_MAI__V_MFMA_F32_4X4X4_16B_F16 = + Inst_VOP3P_MAI__V_MFMA_MXFP<4, 4, 4, 16, AMDGPU::mxfloat16, + &MNEM__V_MFMA_F32_4X4X4_16B_F16>; + + static const char *MNEM__V_MFMA_F32_32X32X8_BF16 = + "v_mfma_f32_32x32x8_bf16"; + using Inst_VOP3P_MAI__V_MFMA_F32_32X32X8_BF16 = + Inst_VOP3P_MAI__V_MFMA_MXFP<32, 32, 8, 1, AMDGPU::mxbfloat16, + &MNEM__V_MFMA_F32_32X32X8_BF16>; + + + template + class Inst_VOP3P_MAI__V_MFMA_I8 : public Inst_VOP3P_MAI + { + + private: + // Only int8 exists at the moment, but make the type a parameter. 
+ using DT = int8_t; + static constexpr int DT_bits = sizeof(DT) * 8; + + // Scale GPRs needed by elements / GPR (gpr_ratio) + static constexpr int gpr_ratio = 32 / DT_bits; + static constexpr int gprs_a = M * K * B / (64 * gpr_ratio); + static constexpr int gprs_b = K * N * B / (64 * gpr_ratio); + + // Always F32 which has an effective gpr_ratio of 1 + static constexpr int gprs_c_d = M * N * B / 64; + + public: + Inst_VOP3P_MAI__V_MFMA_I8(InFmt_VOP3P_MAI *iFmt) + : Inst_VOP3P_MAI(iFmt, *MNEMONIC) + { + setFlag(ALU); + setFlag(MFMA); + setFlag(I8); + } + ~Inst_VOP3P_MAI__V_MFMA_I8() {} + + int getNumOperands() override { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int getOperandSize(int opIdx) override { + switch (opIdx) { + case 0: // src0 "A" + return 4*gprs_a; + case 1: // src1 "B" + return 4*gprs_b; + case 2: // src2 "C" + return 4*gprs_c_d; + case 3: // dst + return 4*gprs_c_d; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void + execute(GPUDynInstPtr gpuDynInst) override + { + int acc_cd_off = 0; + int acc_a_off = 0; + int acc_b_off = 0; + if (instData.ACC_CD) { + acc_cd_off = gpuDynInst->wavefront()->accumOffset; + } + if (extData.ACC) { + int tmp_acc = extData.ACC; + if (tmp_acc & 0x1) { + acc_a_off = gpuDynInst->wavefront()->accumOffset; + } + if (tmp_acc & 0x2) { + acc_b_off = gpuDynInst->wavefront()->accumOffset; + } + } + + // Read the packed types as U32 - Consider this "untyped." + // A ConstVecOperand needs to be used for src2 as it could be an + // inline constant. The Const version provides an operator[] overload + // to read inline constants to each lane. The non-const type of src2 + // should be used for vdst to make it writeable. 
+ using T1 = ConstVecOperandU32; + using T2 = ConstVecOperandI32; + using T3 = VecOperandI32; + + alignas(T1) std::byte _src0[sizeof(T1) * gprs_a]; + alignas(T1) std::byte _src1[sizeof(T1) * gprs_b]; + alignas(T2) std::byte _src2[sizeof(T2) * gprs_c_d]; + alignas(T3) std::byte _vdst[sizeof(T3) * gprs_c_d]; + T1 *src0 = std::launder(reinterpret_cast(&_src0)); + T1 *src1 = std::launder(reinterpret_cast(&_src1)); + T2 *src2 = std::launder(reinterpret_cast(&_src2)); + T3 *vdst = std::launder(reinterpret_cast(&_vdst)); + + // Handling of src2 is a bit tricky. The operator[] overload cannot + // be used for dword count > 2, and the dword count here is 4. Usually + // src2 is a VGPR/AccGPR, but it might also be constant. In order to + // use operator[] and handle constants, check for VGPR here and set + // a delta for each of the src2 GPRs. + + int delta = isVectorReg(extData.SRC0) ? 1 : 0; + for (int i = 0; i < gprs_a; i++) { + new (&src0[i]) T1(gpuDynInst, extData.SRC0+acc_a_off+i*delta); + src0[i].readSrc(); + } + + delta = isVectorReg(extData.SRC1) ? 1 : 0; + for (int i = 0; i < gprs_b; i++) { + new (&src1[i]) T1(gpuDynInst, extData.SRC1+acc_b_off+i*delta); + src1[i].readSrc(); + } + + delta = isVectorReg(extData.SRC2) ? 1 : 0; + for (int i = 0; i < gprs_c_d; i++) { + new (&src2[i]) T2(gpuDynInst, extData.SRC2+acc_cd_off+i*delta); + src2[i].readSrc(); + } + + for (int i = 0; i < gprs_c_d; i++) { + new (&vdst[i]) T3(gpuDynInst, instData.VDST+acc_cd_off+i); + } + + // These values and meanings are described in the MI300 ISA manual: + // + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + // + // in section 7.1.4.2. In theory, only the M, N, K, and H values change + // for each MFMA instruction. 
+ + // Output layout + constexpr int H = 4; + + // This replaces `constexpr int B_I = std::ceil(64.0f / (N * M / H));` + // which failed clang compiler tests as it's not a constant expression. + constexpr float B_I_f = 64.0f / (N * M / H); + constexpr int B_I = + (static_cast(static_cast(B_I_f)) == B_I_f) + ? static_cast(B_I_f) + : static_cast(B_I_f) + ((B_I_f > 0) ? 1 : 0); + + constexpr int M_I = (64 / B_I) / N; + constexpr int G = M / (H * M_I); + + int32_t result[M][N]; + + // Input layout + constexpr int K_L = K / (64 / (M * B)); + + for (int block = 0; block < B; block++) { + // Load src2 into result. src2 is row major + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + int item = (i % H) + H * (i/(H*M_I) + G * (block / B_I)); + int lane = j + N * ((i / H) % M_I + M_I * (block % B_I)); + + result[i][j] = src2[item][lane]; + } + } + + // Compute new result + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < K; ++k) { + // src0 is column major, src1 is row major + int lane_A = i + M * (block + B * (k / K_L)); + int lane_B = j + N * (block + B * (k / K_L)); + int item = k % K_L; + + PackedReg A_elems; + PackedReg B_elems; + + for (int i = 0; i < gprs_a; ++i) { + A_elems.setDword(i, src0[i][lane_A]); + } + for (int i = 0; i < gprs_b; ++i) { + B_elems.setDword(i, src1[i][lane_B]); + } + + DT item_A(A_elems.getElem(item)); + DT item_B(B_elems.getElem(item)); + + result[i][j] += int32_t(item_A) * int32_t(item_B); + } + } + } + + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + int item = (i % H) + H * (i/(H*M_I) + G * (block / B_I)); + int lane = j + N * ((i / H) % M_I + M_I * (block % B_I)); + + vdst[item][lane] = result[i][j]; + } + } + } + + for (int i = 0; i < gprs_c_d; ++i) { + vdst[i].write(); + } + + for (int i = 0; i < gprs_a; i++) { + std::destroy_at(&src0[i]); + } + for (int i = 0; i < gprs_b; i++) { + std::destroy_at(&src1[i]); + } + for (int i = 0; i < gprs_c_d; i++) { + 
std::destroy_at(&src2[i]); + } + for (int i = 0; i < gprs_c_d; i++) { + std::destroy_at(&vdst[i]); + } + } // execute + }; + + static const char *MNEM__V_MFMA_I32_16X16X16_I8 = + "v_mfma_i32_16x16x16_i8"; + using Inst_VOP3P_MAI__V_MFMA_I32_16X16X16_I8 = + Inst_VOP3P_MAI__V_MFMA_I8<16, 16, 16, 1, + &MNEM__V_MFMA_I32_16X16X16_I8>; + + + class Inst_VOP3__V_CVT_PK_FP8_F32 : public Inst_VOP3A + { + public: + Inst_VOP3__V_CVT_PK_FP8_F32(InFmt_VOP3A*); + ~Inst_VOP3__V_CVT_PK_FP8_F32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 4; + case 1: //src_1 + return 4; + case 2: //vdst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_CVT_PK_FP8_F32 } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/mimg.cc b/src/arch/amdgpu/vega/insts/mimg.cc new file mode 100644 index 0000000000..29a37cca1d --- /dev/null +++ b/src/arch/amdgpu/vega/insts/mimg.cc @@ -0,0 +1,2047 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_MIMG__IMAGE_LOAD class methods --- + + Inst_MIMG__IMAGE_LOAD::Inst_MIMG__IMAGE_LOAD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD + + Inst_MIMG__IMAGE_LOAD::~Inst_MIMG__IMAGE_LOAD() + { + } // ~Inst_MIMG__IMAGE_LOAD + + // --- description from .arch file --- + // Image memory load with format conversion specified in T#. No sampler. 
+ void + Inst_MIMG__IMAGE_LOAD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_MIP class methods --- + + Inst_MIMG__IMAGE_LOAD_MIP::Inst_MIMG__IMAGE_LOAD_MIP(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_mip") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_MIP + + Inst_MIMG__IMAGE_LOAD_MIP::~Inst_MIMG__IMAGE_LOAD_MIP() + { + } // ~Inst_MIMG__IMAGE_LOAD_MIP + + // --- description from .arch file --- + // Image memory load with user-supplied mip level. No sampler. + void + Inst_MIMG__IMAGE_LOAD_MIP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_MIP::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_PCK class methods --- + + Inst_MIMG__IMAGE_LOAD_PCK::Inst_MIMG__IMAGE_LOAD_PCK(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_pck") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_PCK + + Inst_MIMG__IMAGE_LOAD_PCK::~Inst_MIMG__IMAGE_LOAD_PCK() + { + } // ~Inst_MIMG__IMAGE_LOAD_PCK + + // --- description from .arch file --- + // Image memory load with no format conversion. No sampler. 
+ void + Inst_MIMG__IMAGE_LOAD_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_PCK_SGN class methods --- + + Inst_MIMG__IMAGE_LOAD_PCK_SGN::Inst_MIMG__IMAGE_LOAD_PCK_SGN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_pck_sgn") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_PCK_SGN + + Inst_MIMG__IMAGE_LOAD_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_PCK_SGN() + { + } // ~Inst_MIMG__IMAGE_LOAD_PCK_SGN + + // --- description from .arch file --- + // Image memory load with with no format conversion and sign extension. No + // --- sampler. + void + Inst_MIMG__IMAGE_LOAD_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK class methods --- + + Inst_MIMG__IMAGE_LOAD_MIP_PCK::Inst_MIMG__IMAGE_LOAD_MIP_PCK( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_mip_pck") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_MIP_PCK + + Inst_MIMG__IMAGE_LOAD_MIP_PCK::~Inst_MIMG__IMAGE_LOAD_MIP_PCK() + { + } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK + + // --- description from .arch file --- + // Image memory load with user-supplied mip level, no format conversion. No + // --- sampler. 
+ void + Inst_MIMG__IMAGE_LOAD_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN class methods --- + + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_load_mip_pck_sgn") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN + + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN() + { + } // ~Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN + + // --- description from .arch file --- + // Image memory load with user-supplied mip level, no format conversion and + // --- with sign extension. No sampler. + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_LOAD_MIP_PCK_SGN::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE class methods --- + + Inst_MIMG__IMAGE_STORE::Inst_MIMG__IMAGE_STORE(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE + + Inst_MIMG__IMAGE_STORE::~Inst_MIMG__IMAGE_STORE() + { + } // ~Inst_MIMG__IMAGE_STORE + + // --- description from .arch file --- + // Image memory store with format conversion specified in T#. No sampler. 
+ void + Inst_MIMG__IMAGE_STORE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE_MIP class methods --- + + Inst_MIMG__IMAGE_STORE_MIP::Inst_MIMG__IMAGE_STORE_MIP(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store_mip") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE_MIP + + Inst_MIMG__IMAGE_STORE_MIP::~Inst_MIMG__IMAGE_STORE_MIP() + { + } // ~Inst_MIMG__IMAGE_STORE_MIP + + // --- description from .arch file --- + // Image memory store with format conversion specified in T# to user + // specified mip level. No sampler. + void + Inst_MIMG__IMAGE_STORE_MIP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE_MIP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE_MIP::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE_PCK class methods --- + + Inst_MIMG__IMAGE_STORE_PCK::Inst_MIMG__IMAGE_STORE_PCK(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store_pck") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE_PCK + + Inst_MIMG__IMAGE_STORE_PCK::~Inst_MIMG__IMAGE_STORE_PCK() + { + } // ~Inst_MIMG__IMAGE_STORE_PCK + + // --- description from .arch file --- + // Image memory store of packed data without format conversion. No sampler. 
+ void + Inst_MIMG__IMAGE_STORE_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_STORE_MIP_PCK class methods --- + + Inst_MIMG__IMAGE_STORE_MIP_PCK::Inst_MIMG__IMAGE_STORE_MIP_PCK( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_store_mip_pck") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_STORE_MIP_PCK + + Inst_MIMG__IMAGE_STORE_MIP_PCK::~Inst_MIMG__IMAGE_STORE_MIP_PCK() + { + } // ~Inst_MIMG__IMAGE_STORE_MIP_PCK + + // --- description from .arch file --- + // Image memory store of packed data without format conversion to + // user-supplied mip level. No sampler. + void + Inst_MIMG__IMAGE_STORE_MIP_PCK::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MIMG__IMAGE_STORE_MIP_PCK::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MIMG__IMAGE_STORE_MIP_PCK::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MIMG__IMAGE_GET_RESINFO class methods --- + + Inst_MIMG__IMAGE_GET_RESINFO::Inst_MIMG__IMAGE_GET_RESINFO( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_get_resinfo") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GET_RESINFO + + Inst_MIMG__IMAGE_GET_RESINFO::~Inst_MIMG__IMAGE_GET_RESINFO() + { + } // ~Inst_MIMG__IMAGE_GET_RESINFO + + // --- description from .arch file --- + // return resource info for a given mip level specified in the address + // vgpr. No sampler. Returns 4 integer values into VGPRs 3-0: + // {num_mip_levels, depth, height, width}. 
+ void + Inst_MIMG__IMAGE_GET_RESINFO::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SWAP class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SWAP::Inst_MIMG__IMAGE_ATOMIC_SWAP( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_swap") + { + setFlag(AtomicExch); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SWAP + + Inst_MIMG__IMAGE_ATOMIC_SWAP::~Inst_MIMG__IMAGE_ATOMIC_SWAP() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_CMPSWAP class methods --- + + Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::Inst_MIMG__IMAGE_ATOMIC_CMPSWAP( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_cmpswap") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_CMPSWAP + + Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_CMPSWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA[0]; + // cmp = DATA[1]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. 
+ void + Inst_MIMG__IMAGE_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_ADD class methods --- + + Inst_MIMG__IMAGE_ATOMIC_ADD::Inst_MIMG__IMAGE_ATOMIC_ADD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_add") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_ADD + + Inst_MIMG__IMAGE_ATOMIC_ADD::~Inst_MIMG__IMAGE_ATOMIC_ADD() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_ADD + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. + void + Inst_MIMG__IMAGE_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_ATOMIC_SUB class methods --- + + Inst_MIMG__IMAGE_ATOMIC_SUB::Inst_MIMG__IMAGE_ATOMIC_SUB(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_atomic_sub") + { + setFlag(AtomicSub); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_ATOMIC_SUB + + Inst_MIMG__IMAGE_ATOMIC_SUB::~Inst_MIMG__IMAGE_ATOMIC_SUB() + { + } // ~Inst_MIMG__IMAGE_ATOMIC_SUB + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= DATA; + // RETURN_DATA = tmp. 
+ void
+ Inst_MIMG__IMAGE_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // NOTE(review): the signed (SMIN/SMAX) and unsigned (UMIN/UMAX) image
+ // atomics below map onto the same AtomicMin/AtomicMax flags, so the flag
+ // set does not encode signedness. Harmless while execute() just panics,
+ // but confirm signedness is carried elsewhere before implementing these.
+ // --- Inst_MIMG__IMAGE_ATOMIC_SMIN class methods ---
+
+ Inst_MIMG__IMAGE_ATOMIC_SMIN::Inst_MIMG__IMAGE_ATOMIC_SMIN(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_atomic_smin")
+ {
+ setFlag(AtomicMin);
+ if (instData.GLC) {
+ setFlag(AtomicReturn);
+ } else {
+ setFlag(AtomicNoReturn);
+ }
+ setFlag(MemoryRef);
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_ATOMIC_SMIN
+
+ Inst_MIMG__IMAGE_ATOMIC_SMIN::~Inst_MIMG__IMAGE_ATOMIC_SMIN()
+ {
+ } // ~Inst_MIMG__IMAGE_ATOMIC_SMIN
+
+ // --- description from .arch file ---
+ // 32b:
+ // tmp = MEM[ADDR];
+ // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
+ // RETURN_DATA = tmp.
+ void
+ Inst_MIMG__IMAGE_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_ATOMIC_UMIN class methods ---
+
+ Inst_MIMG__IMAGE_ATOMIC_UMIN::Inst_MIMG__IMAGE_ATOMIC_UMIN(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_atomic_umin")
+ {
+ setFlag(AtomicMin);
+ if (instData.GLC) {
+ setFlag(AtomicReturn);
+ } else {
+ setFlag(AtomicNoReturn);
+ }
+ setFlag(MemoryRef);
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_ATOMIC_UMIN
+
+ Inst_MIMG__IMAGE_ATOMIC_UMIN::~Inst_MIMG__IMAGE_ATOMIC_UMIN()
+ {
+ } // ~Inst_MIMG__IMAGE_ATOMIC_UMIN
+
+ // --- description from .arch file ---
+ // 32b:
+ // tmp = MEM[ADDR];
+ // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
+ // RETURN_DATA = tmp.
+ void
+ Inst_MIMG__IMAGE_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_ATOMIC_SMAX class methods ---
+
+ Inst_MIMG__IMAGE_ATOMIC_SMAX::Inst_MIMG__IMAGE_ATOMIC_SMAX(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_atomic_smax")
+ {
+ setFlag(AtomicMax);
+ if (instData.GLC) {
+ setFlag(AtomicReturn);
+ } else {
+ setFlag(AtomicNoReturn);
+ }
+ setFlag(MemoryRef);
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_ATOMIC_SMAX
+
+ Inst_MIMG__IMAGE_ATOMIC_SMAX::~Inst_MIMG__IMAGE_ATOMIC_SMAX()
+ {
+ } // ~Inst_MIMG__IMAGE_ATOMIC_SMAX
+
+ // --- description from .arch file ---
+ // 32b:
+ // tmp = MEM[ADDR];
+ // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
+ // RETURN_DATA = tmp.
+ void
+ Inst_MIMG__IMAGE_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_ATOMIC_UMAX class methods ---
+
+ Inst_MIMG__IMAGE_ATOMIC_UMAX::Inst_MIMG__IMAGE_ATOMIC_UMAX(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_atomic_umax")
+ {
+ setFlag(AtomicMax);
+ if (instData.GLC) {
+ setFlag(AtomicReturn);
+ } else {
+ setFlag(AtomicNoReturn);
+ }
+ setFlag(MemoryRef);
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_ATOMIC_UMAX
+
+ Inst_MIMG__IMAGE_ATOMIC_UMAX::~Inst_MIMG__IMAGE_ATOMIC_UMAX()
+ {
+ } // ~Inst_MIMG__IMAGE_ATOMIC_UMAX
+
+ // --- description from .arch file ---
+ // 32b:
+ // tmp = MEM[ADDR];
+ // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
+ // RETURN_DATA = tmp.
+ void
+ Inst_MIMG__IMAGE_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // The bitwise and wrapping inc/dec image atomics below all follow the
+ // same decode pattern: op-specific atomic flag, GLC-selected
+ // AtomicReturn/AtomicNoReturn, then MemoryRef and GlobalSegment.
+ // execute() is an unimplemented stub for each.
+ // --- Inst_MIMG__IMAGE_ATOMIC_AND class methods ---
+
+ Inst_MIMG__IMAGE_ATOMIC_AND::Inst_MIMG__IMAGE_ATOMIC_AND(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_atomic_and")
+ {
+ setFlag(AtomicAnd);
+ if (instData.GLC) {
+ setFlag(AtomicReturn);
+ } else {
+ setFlag(AtomicNoReturn);
+ }
+ setFlag(MemoryRef);
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_ATOMIC_AND
+
+ Inst_MIMG__IMAGE_ATOMIC_AND::~Inst_MIMG__IMAGE_ATOMIC_AND()
+ {
+ } // ~Inst_MIMG__IMAGE_ATOMIC_AND
+
+ // --- description from .arch file ---
+ // 32b:
+ // tmp = MEM[ADDR];
+ // MEM[ADDR] &= DATA;
+ // RETURN_DATA = tmp.
+ void
+ Inst_MIMG__IMAGE_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_ATOMIC_OR class methods ---
+
+ Inst_MIMG__IMAGE_ATOMIC_OR::Inst_MIMG__IMAGE_ATOMIC_OR(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_atomic_or")
+ {
+ setFlag(AtomicOr);
+ if (instData.GLC) {
+ setFlag(AtomicReturn);
+ } else {
+ setFlag(AtomicNoReturn);
+ }
+ setFlag(MemoryRef);
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_ATOMIC_OR
+
+ Inst_MIMG__IMAGE_ATOMIC_OR::~Inst_MIMG__IMAGE_ATOMIC_OR()
+ {
+ } // ~Inst_MIMG__IMAGE_ATOMIC_OR
+
+ // --- description from .arch file ---
+ // 32b:
+ // tmp = MEM[ADDR];
+ // MEM[ADDR] |= DATA;
+ // RETURN_DATA = tmp.
+ void
+ Inst_MIMG__IMAGE_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_ATOMIC_XOR class methods ---
+
+ Inst_MIMG__IMAGE_ATOMIC_XOR::Inst_MIMG__IMAGE_ATOMIC_XOR(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_atomic_xor")
+ {
+ setFlag(AtomicXor);
+ if (instData.GLC) {
+ setFlag(AtomicReturn);
+ } else {
+ setFlag(AtomicNoReturn);
+ }
+ setFlag(MemoryRef);
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_ATOMIC_XOR
+
+ Inst_MIMG__IMAGE_ATOMIC_XOR::~Inst_MIMG__IMAGE_ATOMIC_XOR()
+ {
+ } // ~Inst_MIMG__IMAGE_ATOMIC_XOR
+
+ // --- description from .arch file ---
+ // 32b:
+ // tmp = MEM[ADDR];
+ // MEM[ADDR] ^= DATA;
+ // RETURN_DATA = tmp.
+ void
+ Inst_MIMG__IMAGE_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_ATOMIC_INC class methods ---
+
+ Inst_MIMG__IMAGE_ATOMIC_INC::Inst_MIMG__IMAGE_ATOMIC_INC(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_atomic_inc")
+ {
+ setFlag(AtomicInc);
+ if (instData.GLC) {
+ setFlag(AtomicReturn);
+ } else {
+ setFlag(AtomicNoReturn);
+ }
+ setFlag(MemoryRef);
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_ATOMIC_INC
+
+ Inst_MIMG__IMAGE_ATOMIC_INC::~Inst_MIMG__IMAGE_ATOMIC_INC()
+ {
+ } // ~Inst_MIMG__IMAGE_ATOMIC_INC
+
+ // --- description from .arch file ---
+ // 32b:
+ // tmp = MEM[ADDR];
+ // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
+ // RETURN_DATA = tmp.
+ void
+ Inst_MIMG__IMAGE_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_ATOMIC_DEC class methods ---
+
+ Inst_MIMG__IMAGE_ATOMIC_DEC::Inst_MIMG__IMAGE_ATOMIC_DEC(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_atomic_dec")
+ {
+ setFlag(AtomicDec);
+ // GLC=1: return the pre-op value; GLC=0: no return value.
+ if (instData.GLC) {
+ setFlag(AtomicReturn);
+ } else {
+ setFlag(AtomicNoReturn);
+ }
+ setFlag(MemoryRef);
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_ATOMIC_DEC
+
+ Inst_MIMG__IMAGE_ATOMIC_DEC::~Inst_MIMG__IMAGE_ATOMIC_DEC()
+ {
+ } // ~Inst_MIMG__IMAGE_ATOMIC_DEC
+
+ // --- description from .arch file ---
+ // 32b:
+ // tmp = MEM[ADDR];
+ // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
+ // (unsigned compare); RETURN_DATA = tmp.
+ void
+ Inst_MIMG__IMAGE_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE::Inst_MIMG__IMAGE_SAMPLE(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample")
+ {
+ // Fix: this was the only IMAGE_SAMPLE*/IMAGE_GATHER4* constructor in
+ // this file that did not tag the access as a global-segment reference;
+ // set it here for consistency with every other sample/gather variant.
+ // Inert for now since execute() is an unimplemented stub.
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE
+
+ Inst_MIMG__IMAGE_SAMPLE::~Inst_MIMG__IMAGE_SAMPLE()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE
+
+ // --- description from .arch file ---
+ // sample texture map.
+ void
+ Inst_MIMG__IMAGE_SAMPLE::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_CL class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_CL::Inst_MIMG__IMAGE_SAMPLE_CL(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_CL
+
+ Inst_MIMG__IMAGE_SAMPLE_CL::~Inst_MIMG__IMAGE_SAMPLE_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_CL
+
+ // --- description from .arch file ---
+ // sample texture map, with LOD clamp specified in shader.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // IMAGE_SAMPLE_* variants: each constructor only tags the instruction as
+ // a global-segment access and each execute() is an unimplemented stub.
+ // Per the arch-file descriptions below, the suffixes mean: D = user
+ // derivatives, CL = LOD clamp from shader, L = user LOD, B = lod bias,
+ // LZ = sample from level 0, C = depth compare (PCF).
+ // --- Inst_MIMG__IMAGE_SAMPLE_D class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_D::Inst_MIMG__IMAGE_SAMPLE_D(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_d")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_D
+
+ Inst_MIMG__IMAGE_SAMPLE_D::~Inst_MIMG__IMAGE_SAMPLE_D()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_D
+
+ // --- description from .arch file ---
+ // sample texture map, with user derivatives
+ void
+ Inst_MIMG__IMAGE_SAMPLE_D::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_D_CL class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_D_CL::Inst_MIMG__IMAGE_SAMPLE_D_CL(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_d_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_D_CL
+
+ Inst_MIMG__IMAGE_SAMPLE_D_CL::~Inst_MIMG__IMAGE_SAMPLE_D_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL
+
+ // --- description from .arch file ---
+ // sample texture map, with LOD clamp specified in shader, with user
+ // --- derivatives.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_D_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_L class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_L::Inst_MIMG__IMAGE_SAMPLE_L(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_l")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_L
+
+ Inst_MIMG__IMAGE_SAMPLE_L::~Inst_MIMG__IMAGE_SAMPLE_L()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_L
+
+ // --- description from .arch file ---
+ // sample texture map, with user LOD.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_L::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_B class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_B::Inst_MIMG__IMAGE_SAMPLE_B(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_b")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_B
+
+ Inst_MIMG__IMAGE_SAMPLE_B::~Inst_MIMG__IMAGE_SAMPLE_B()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_B
+
+ // --- description from .arch file ---
+ // sample texture map, with lod bias.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_B::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_B_CL class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_B_CL::Inst_MIMG__IMAGE_SAMPLE_B_CL(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_b_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_B_CL
+
+ Inst_MIMG__IMAGE_SAMPLE_B_CL::~Inst_MIMG__IMAGE_SAMPLE_B_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL
+
+ // --- description from .arch file ---
+ // sample texture map, with LOD clamp specified in shader, with lod bias.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_B_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_LZ class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_LZ::Inst_MIMG__IMAGE_SAMPLE_LZ(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_lz")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_LZ
+
+ Inst_MIMG__IMAGE_SAMPLE_LZ::~Inst_MIMG__IMAGE_SAMPLE_LZ()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_LZ
+
+ // --- description from .arch file ---
+ // sample texture map, from level 0.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_LZ::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C::Inst_MIMG__IMAGE_SAMPLE_C(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C
+
+ Inst_MIMG__IMAGE_SAMPLE_C::~Inst_MIMG__IMAGE_SAMPLE_C()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C
+
+ // --- description from .arch file ---
+ // sample texture map, with PCF.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_CL class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_CL::Inst_MIMG__IMAGE_SAMPLE_C_CL(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_CL
+
+ Inst_MIMG__IMAGE_SAMPLE_C_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL
+
+ // --- description from .arch file ---
+ // SAMPLE_C, with LOD clamp specified in shader.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_D class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_D::Inst_MIMG__IMAGE_SAMPLE_C_D(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_d")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_D
+
+ Inst_MIMG__IMAGE_SAMPLE_C_D::~Inst_MIMG__IMAGE_SAMPLE_C_D()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_D
+
+ // --- description from .arch file ---
+ // SAMPLE_C, with user derivatives.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_D::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_D_CL::Inst_MIMG__IMAGE_SAMPLE_C_D_CL(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_d_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL
+
+ Inst_MIMG__IMAGE_SAMPLE_C_D_CL::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL
+
+ // --- description from .arch file ---
+ // SAMPLE_C, with LOD clamp specified in shader, with user derivatives.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_D_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_L class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_L::Inst_MIMG__IMAGE_SAMPLE_C_L(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_l")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_L
+
+ Inst_MIMG__IMAGE_SAMPLE_C_L::~Inst_MIMG__IMAGE_SAMPLE_C_L()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_L
+
+ // --- description from .arch file ---
+ // SAMPLE_C, with user LOD.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_L::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_B class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_B::Inst_MIMG__IMAGE_SAMPLE_C_B(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_b")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_B
+
+ Inst_MIMG__IMAGE_SAMPLE_C_B::~Inst_MIMG__IMAGE_SAMPLE_C_B()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_B
+
+ // --- description from .arch file ---
+ // SAMPLE_C, with lod bias.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_B::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_B_CL::Inst_MIMG__IMAGE_SAMPLE_C_B_CL(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_b_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL
+
+ Inst_MIMG__IMAGE_SAMPLE_C_B_CL::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL
+
+ // --- description from .arch file ---
+ // SAMPLE_C, with LOD clamp specified in shader, with lod bias.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_B_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_LZ::Inst_MIMG__IMAGE_SAMPLE_C_LZ(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_lz")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_LZ
+
+ Inst_MIMG__IMAGE_SAMPLE_C_LZ::~Inst_MIMG__IMAGE_SAMPLE_C_LZ()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ
+
+ // --- description from .arch file ---
+ // SAMPLE_C, from level 0.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_LZ::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_O::Inst_MIMG__IMAGE_SAMPLE_O(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_O
+
+ Inst_MIMG__IMAGE_SAMPLE_O::~Inst_MIMG__IMAGE_SAMPLE_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_O
+
+ // --- description from .arch file ---
+ // sample texture map, with user offsets.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // IMAGE_SAMPLE_*_O variants: the trailing O adds user offsets to the
+ // corresponding non-O sample variant (see descriptions below). Same
+ // decode pattern as above: GlobalSegment tag only, execute() stubbed.
+ // --- Inst_MIMG__IMAGE_SAMPLE_CL_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_CL_O::Inst_MIMG__IMAGE_SAMPLE_CL_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_cl_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_CL_O
+
+ Inst_MIMG__IMAGE_SAMPLE_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CL_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_CL_O
+
+ // --- description from .arch file ---
+ // SAMPLE_O with LOD clamp specified in shader.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_CL_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_D_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_D_O::Inst_MIMG__IMAGE_SAMPLE_D_O(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_d_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_D_O
+
+ Inst_MIMG__IMAGE_SAMPLE_D_O::~Inst_MIMG__IMAGE_SAMPLE_D_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_D_O
+
+ // --- description from .arch file ---
+ // SAMPLE_O, with user derivatives.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_D_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_D_CL_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_D_CL_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_d_cl_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_D_CL_O
+
+ Inst_MIMG__IMAGE_SAMPLE_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_D_CL_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_D_CL_O
+
+ // --- description from .arch file ---
+ // SAMPLE_O, with LOD clamp specified in shader, with user derivatives.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_D_CL_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_L_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_L_O::Inst_MIMG__IMAGE_SAMPLE_L_O(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_l_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_L_O
+
+ Inst_MIMG__IMAGE_SAMPLE_L_O::~Inst_MIMG__IMAGE_SAMPLE_L_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_L_O
+
+ // --- description from .arch file ---
+ // SAMPLE_O, with user LOD.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_L_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_B_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_B_O::Inst_MIMG__IMAGE_SAMPLE_B_O(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_b_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_B_O
+
+ Inst_MIMG__IMAGE_SAMPLE_B_O::~Inst_MIMG__IMAGE_SAMPLE_B_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_B_O
+
+ // --- description from .arch file ---
+ // SAMPLE_O, with lod bias.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_B_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_B_CL_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_B_CL_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_b_cl_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_B_CL_O
+
+ Inst_MIMG__IMAGE_SAMPLE_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_B_CL_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_B_CL_O
+
+ // --- description from .arch file ---
+ // SAMPLE_O, with LOD clamp specified in shader, with lod bias.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_LZ_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_LZ_O::Inst_MIMG__IMAGE_SAMPLE_LZ_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_lz_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_LZ_O
+
+ Inst_MIMG__IMAGE_SAMPLE_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_LZ_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_LZ_O
+
+ // --- description from .arch file ---
+ // SAMPLE_O, from level 0.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_LZ_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_O::Inst_MIMG__IMAGE_SAMPLE_C_O(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_O
+
+ Inst_MIMG__IMAGE_SAMPLE_C_O::~Inst_MIMG__IMAGE_SAMPLE_C_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_O
+
+ // --- description from .arch file ---
+ // SAMPLE_C with user specified offsets.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_CL_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CL_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_cl_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_CL_O
+
+ Inst_MIMG__IMAGE_SAMPLE_C_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CL_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_CL_O
+
+ // --- description from .arch file ---
+ // SAMPLE_C_O, with LOD clamp specified in shader.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_CL_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_D_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_D_O::Inst_MIMG__IMAGE_SAMPLE_C_D_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_d_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_D_O
+
+ Inst_MIMG__IMAGE_SAMPLE_C_D_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_O
+
+ // --- description from .arch file ---
+ // SAMPLE_C_O, with user derivatives.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_D_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_d_cl_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O
+
+ Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O
+
+ // --- description from .arch file ---
+ // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_D_CL_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_L_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_L_O::Inst_MIMG__IMAGE_SAMPLE_C_L_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_l_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_L_O
+
+ Inst_MIMG__IMAGE_SAMPLE_C_L_O::~Inst_MIMG__IMAGE_SAMPLE_C_L_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_L_O
+
+ // --- description from .arch file ---
+ // SAMPLE_C_O, with user LOD.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_L_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_B_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_B_O::Inst_MIMG__IMAGE_SAMPLE_C_B_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_b_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_B_O
+
+ Inst_MIMG__IMAGE_SAMPLE_C_B_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_O
+
+ // --- description from .arch file ---
+ // SAMPLE_C_O, with lod bias.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_B_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_b_cl_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O
+
+ Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O
+
+ // --- description from .arch file ---
+ // SAMPLE_C_O, with LOD clamp specified in shader, with lod bias.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_SAMPLE_C_LZ_O class methods ---
+
+ Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::Inst_MIMG__IMAGE_SAMPLE_C_LZ_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_sample_c_lz_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_SAMPLE_C_LZ_O
+
+ Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O()
+ {
+ } // ~Inst_MIMG__IMAGE_SAMPLE_C_LZ_O
+
+ // --- description from .arch file ---
+ // SAMPLE_C_O, from level 0.
+ void
+ Inst_MIMG__IMAGE_SAMPLE_C_LZ_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // IMAGE_GATHER4* family: gather four single-component elements (2x2),
+ // with the same suffix scheme as the sample variants (CL/L/B/LZ/C/O).
+ // Constructors tag GlobalSegment only; every execute() is a stub.
+ // --- Inst_MIMG__IMAGE_GATHER4 class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4::Inst_MIMG__IMAGE_GATHER4(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4
+
+ Inst_MIMG__IMAGE_GATHER4::~Inst_MIMG__IMAGE_GATHER4()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2).
+ void
+ Inst_MIMG__IMAGE_GATHER4::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_CL class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_CL::Inst_MIMG__IMAGE_GATHER4_CL(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_CL
+
+ Inst_MIMG__IMAGE_GATHER4_CL::~Inst_MIMG__IMAGE_GATHER4_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_CL
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) with user LOD clamp.
+ void
+ Inst_MIMG__IMAGE_GATHER4_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_L class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_L::Inst_MIMG__IMAGE_GATHER4_L(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_l")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_L
+
+ Inst_MIMG__IMAGE_GATHER4_L::~Inst_MIMG__IMAGE_GATHER4_L()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_L
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) with user LOD.
+ void
+ Inst_MIMG__IMAGE_GATHER4_L::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_B class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_B::Inst_MIMG__IMAGE_GATHER4_B(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_b")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_B
+
+ Inst_MIMG__IMAGE_GATHER4_B::~Inst_MIMG__IMAGE_GATHER4_B()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_B
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) with user bias.
+ void
+ Inst_MIMG__IMAGE_GATHER4_B::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_B_CL class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_B_CL::Inst_MIMG__IMAGE_GATHER4_B_CL(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_b_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_B_CL
+
+ Inst_MIMG__IMAGE_GATHER4_B_CL::~Inst_MIMG__IMAGE_GATHER4_B_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_B_CL
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) with user bias and clamp.
+ void
+ Inst_MIMG__IMAGE_GATHER4_B_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_LZ class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_LZ::Inst_MIMG__IMAGE_GATHER4_LZ(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_lz")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_LZ
+
+ Inst_MIMG__IMAGE_GATHER4_LZ::~Inst_MIMG__IMAGE_GATHER4_LZ()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_LZ
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) at level 0.
+ void
+ Inst_MIMG__IMAGE_GATHER4_LZ::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_C class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_C::Inst_MIMG__IMAGE_GATHER4_C(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_c")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_C
+
+ Inst_MIMG__IMAGE_GATHER4_C::~Inst_MIMG__IMAGE_GATHER4_C()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_C
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) with PCF.
+ void
+ Inst_MIMG__IMAGE_GATHER4_C::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_C_CL class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_C_CL::Inst_MIMG__IMAGE_GATHER4_C_CL(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_c_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_C_CL
+
+ Inst_MIMG__IMAGE_GATHER4_C_CL::~Inst_MIMG__IMAGE_GATHER4_C_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_C_CL
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) with user LOD clamp and PCF.
+ void
+ Inst_MIMG__IMAGE_GATHER4_C_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_C_L class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_C_L::Inst_MIMG__IMAGE_GATHER4_C_L(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_c_l")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_C_L
+
+ Inst_MIMG__IMAGE_GATHER4_C_L::~Inst_MIMG__IMAGE_GATHER4_C_L()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_C_L
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) with user LOD and PCF.
+ void
+ Inst_MIMG__IMAGE_GATHER4_C_L::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_C_B class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_C_B::Inst_MIMG__IMAGE_GATHER4_C_B(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_c_b")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_C_B
+
+ Inst_MIMG__IMAGE_GATHER4_C_B::~Inst_MIMG__IMAGE_GATHER4_C_B()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_C_B
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) with user bias and PCF.
+ void
+ Inst_MIMG__IMAGE_GATHER4_C_B::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_C_B_CL::Inst_MIMG__IMAGE_GATHER4_C_B_CL(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_c_b_cl")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_C_B_CL
+
+ Inst_MIMG__IMAGE_GATHER4_C_B_CL::~Inst_MIMG__IMAGE_GATHER4_C_B_CL()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) with user bias, clamp and PCF.
+ void
+ Inst_MIMG__IMAGE_GATHER4_C_B_CL::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_C_LZ class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_C_LZ::Inst_MIMG__IMAGE_GATHER4_C_LZ(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_c_lz")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_C_LZ
+
+ Inst_MIMG__IMAGE_GATHER4_C_LZ::~Inst_MIMG__IMAGE_GATHER4_C_LZ()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ
+
+ // --- description from .arch file ---
+ // gather 4 single component elements (2x2) at level 0, with PCF.
+ void
+ Inst_MIMG__IMAGE_GATHER4_C_LZ::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_O class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_O::Inst_MIMG__IMAGE_GATHER4_O(InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_O
+
+ Inst_MIMG__IMAGE_GATHER4_O::~Inst_MIMG__IMAGE_GATHER4_O()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_O
+
+ // --- description from .arch file ---
+ // GATHER4, with user offsets.
+ void
+ Inst_MIMG__IMAGE_GATHER4_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_CL_O class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_CL_O::Inst_MIMG__IMAGE_GATHER4_CL_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_cl_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_CL_O
+
+ Inst_MIMG__IMAGE_GATHER4_CL_O::~Inst_MIMG__IMAGE_GATHER4_CL_O()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_CL_O
+
+ // --- description from .arch file ---
+ // GATHER4_CL, with user offsets.
+ void
+ Inst_MIMG__IMAGE_GATHER4_CL_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_L_O class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_L_O::Inst_MIMG__IMAGE_GATHER4_L_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_l_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_L_O
+
+ Inst_MIMG__IMAGE_GATHER4_L_O::~Inst_MIMG__IMAGE_GATHER4_L_O()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_L_O
+
+ // --- description from .arch file ---
+ // GATHER4_L, with user offsets.
+ void
+ Inst_MIMG__IMAGE_GATHER4_L_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_B_O class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_B_O::Inst_MIMG__IMAGE_GATHER4_B_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_b_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_B_O
+
+ Inst_MIMG__IMAGE_GATHER4_B_O::~Inst_MIMG__IMAGE_GATHER4_B_O()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_B_O
+
+ // --- description from .arch file ---
+ // GATHER4_B, with user offsets.
+ void
+ Inst_MIMG__IMAGE_GATHER4_B_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_B_CL_O class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_B_CL_O::Inst_MIMG__IMAGE_GATHER4_B_CL_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_b_cl_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_B_CL_O
+
+ Inst_MIMG__IMAGE_GATHER4_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_B_CL_O()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_B_CL_O
+
+ // --- description from .arch file ---
+ // GATHER4_B_CL, with user offsets.
+ void
+ Inst_MIMG__IMAGE_GATHER4_B_CL_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_LZ_O class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_LZ_O::Inst_MIMG__IMAGE_GATHER4_LZ_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_lz_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_LZ_O
+
+ Inst_MIMG__IMAGE_GATHER4_LZ_O::~Inst_MIMG__IMAGE_GATHER4_LZ_O()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_LZ_O
+
+ // --- description from .arch file ---
+ // GATHER4_LZ, with user offsets.
+ void
+ Inst_MIMG__IMAGE_GATHER4_LZ_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_C_O class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_C_O::Inst_MIMG__IMAGE_GATHER4_C_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_c_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_C_O
+
+ Inst_MIMG__IMAGE_GATHER4_C_O::~Inst_MIMG__IMAGE_GATHER4_C_O()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_C_O
+
+ // --- description from .arch file ---
+ // GATHER4_C, with user offsets.
+ void
+ Inst_MIMG__IMAGE_GATHER4_C_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_C_CL_O class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_C_CL_O::Inst_MIMG__IMAGE_GATHER4_C_CL_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_c_cl_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_C_CL_O
+
+ Inst_MIMG__IMAGE_GATHER4_C_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_CL_O()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_C_CL_O
+
+ // --- description from .arch file ---
+ // GATHER4_C_CL, with user offsets.
+ void
+ Inst_MIMG__IMAGE_GATHER4_C_CL_O::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_MIMG__IMAGE_GATHER4_C_L_O class methods ---
+
+ Inst_MIMG__IMAGE_GATHER4_C_L_O::Inst_MIMG__IMAGE_GATHER4_C_L_O(
+ InFmt_MIMG *iFmt)
+ : Inst_MIMG(iFmt, "image_gather4_c_l_o")
+ {
+ setFlag(GlobalSegment);
+ } // Inst_MIMG__IMAGE_GATHER4_C_L_O
+
+ Inst_MIMG__IMAGE_GATHER4_C_L_O::~Inst_MIMG__IMAGE_GATHER4_C_L_O()
+ {
+ } // ~Inst_MIMG__IMAGE_GATHER4_C_L_O
+
+ // --- description from .arch file ---
+ // GATHER4_C_L, with user offsets.
+ void + Inst_MIMG__IMAGE_GATHER4_C_L_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B_O::Inst_MIMG__IMAGE_GATHER4_C_B_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B_O + + Inst_MIMG__IMAGE_GATHER4_C_B_O::~Inst_MIMG__IMAGE_GATHER4_C_B_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B_O + + // --- description from .arch file --- + // GATHER4_B, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_B_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_B_CL_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::Inst_MIMG__IMAGE_GATHER4_C_B_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_b_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_B_CL_O + + Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_B_CL_O + + // --- description from .arch file --- + // GATHER4_B_CL, with user offsets. + void + Inst_MIMG__IMAGE_GATHER4_C_B_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GATHER4_C_LZ_O class methods --- + + Inst_MIMG__IMAGE_GATHER4_C_LZ_O::Inst_MIMG__IMAGE_GATHER4_C_LZ_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_gather4_c_lz_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GATHER4_C_LZ_O + + Inst_MIMG__IMAGE_GATHER4_C_LZ_O::~Inst_MIMG__IMAGE_GATHER4_C_LZ_O() + { + } // ~Inst_MIMG__IMAGE_GATHER4_C_LZ_O + + // --- description from .arch file --- + // GATHER4_C_LZ, with user offsets. 
+ void + Inst_MIMG__IMAGE_GATHER4_C_LZ_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_GET_LOD class methods --- + + Inst_MIMG__IMAGE_GET_LOD::Inst_MIMG__IMAGE_GET_LOD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_get_lod") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_GET_LOD + + Inst_MIMG__IMAGE_GET_LOD::~Inst_MIMG__IMAGE_GET_LOD() + { + } // ~Inst_MIMG__IMAGE_GET_LOD + + // --- description from .arch file --- + // Return calculated LOD. Vdata gets 2 32bit integer values: { rawLOD, + // --- clampedLOD }. + void + Inst_MIMG__IMAGE_GET_LOD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD::Inst_MIMG__IMAGE_SAMPLE_CD(InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD + + Inst_MIMG__IMAGE_SAMPLE_CD::~Inst_MIMG__IMAGE_SAMPLE_CD() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD + + // --- description from .arch file --- + // sample texture map, with user derivatives (LOD per quad) + void + Inst_MIMG__IMAGE_SAMPLE_CD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD_CL::Inst_MIMG__IMAGE_SAMPLE_CD_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD_CL + + Inst_MIMG__IMAGE_SAMPLE_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_CD_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL + + // --- description from .arch file --- + // sample texture map, with LOD clamp specified in shader, with user + // --- derivatives (LOD per quad). 
+ void + Inst_MIMG__IMAGE_SAMPLE_CD_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD::Inst_MIMG__IMAGE_SAMPLE_C_CD( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD + + Inst_MIMG__IMAGE_SAMPLE_C_CD::~Inst_MIMG__IMAGE_SAMPLE_C_CD() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD + + // --- description from .arch file --- + // SAMPLE_C, with user derivatives (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_C_CD::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd_cl") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL + + // --- description from .arch file --- + // SAMPLE_C, with LOD clamp specified in shader, with user derivatives + // (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD_O::Inst_MIMG__IMAGE_SAMPLE_CD_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD_O + + Inst_MIMG__IMAGE_SAMPLE_CD_O::~Inst_MIMG__IMAGE_SAMPLE_CD_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD_O + + // --- description from .arch file --- + // SAMPLE_O, with user derivatives (LOD per quad). 
+ void + Inst_MIMG__IMAGE_SAMPLE_CD_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_CD_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_CD_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_cd_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_CD_CL_O + + Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_CD_CL_O + + // --- description from .arch file --- + // SAMPLE_O, with LOD clamp specified in shader, with user derivatives + // (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD_O + + Inst_MIMG__IMAGE_SAMPLE_C_CD_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_O + + // --- description from .arch file --- + // SAMPLE_C_O, with user derivatives (LOD per quad). + void + Inst_MIMG__IMAGE_SAMPLE_C_CD_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O class methods --- + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O( + InFmt_MIMG *iFmt) + : Inst_MIMG(iFmt, "image_sample_c_cd_cl_o") + { + setFlag(GlobalSegment); + } // Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O + + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O() + { + } // ~Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O + + // --- description from .arch file --- + // SAMPLE_C_O, with LOD clamp specified in shader, with user derivatives + // (LOD per quad). 
+ void + Inst_MIMG__IMAGE_SAMPLE_C_CD_CL_O::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/mtbuf.cc b/src/arch/amdgpu/vega/insts/mtbuf.cc new file mode 100644 index 0000000000..2b37dfd6b9 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/mtbuf.cc @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "arch/amdgpu/vega/insts/instructions.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_X class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_X(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_X
+
+    // --- description from .arch file ---
+    // Typed buffer load 1 dword with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY
+
+    // --- description from .arch file ---
+    // Typed buffer load 2 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ
+
+    // --- description from .arch file ---
+    // Typed buffer load 3 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
+        ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW
+
+    // --- description from .arch file ---
+    // Typed buffer load 4 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_X class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_X(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_X
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::~Inst_MTBUF__TBUFFER_STORE_FORMAT_X()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_X
+
+    // --- description from .arch file ---
+    // Typed buffer store 1 dword with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XY class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XY(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XY
+
+    // --- description from .arch file ---
+    // Typed buffer store 2 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ
+
+    // --- description from .arch file ---
+    // Typed buffer store 3 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW
+
+    // --- description from .arch file ---
+    // Typed buffer store 4 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_XYZW::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::
+        ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X
+
+    // --- description from .arch file ---
+    // Typed buffer load 1 dword with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_X::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
+        ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY
+
+    // --- description from .arch file ---
+    // Typed buffer load 2 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XY::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ(
+          InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
+        ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ
+
+    // --- description from .arch file ---
+    // Typed buffer load 3 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZ::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW class methods ---
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
+        ::Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW(
+          InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_load_format_d16_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
+
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
+        ::~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW()
+    {
+    } // ~Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW
+
+    // --- description from .arch file ---
+    // Typed buffer load 4 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_LOAD_FORMAT_D16_XYZW::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X
+
+    // --- description from .arch file ---
+    // Typed buffer store 1 dword with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_X::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY
+
+    // --- description from .arch file ---
+    // Typed buffer store 2 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XY::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ
+
+    // --- description from .arch file ---
+    // Typed buffer store 3 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZ::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW class methods ---
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
+        ::Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW(InFmt_MTBUF *iFmt)
+        : Inst_MTBUF(iFmt, "tbuffer_store_format_d16_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
+
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
+        ::~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW()
+    {
+    } // ~Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW
+
+    // --- description from .arch file ---
+    // Typed buffer store 4 dwords with format conversion.
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::execute(
+        GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::initiateAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MTBUF__TBUFFER_STORE_FORMAT_D16_XYZW::completeAcc(
+        GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+} // namespace VegaISA
+} // namespace gem5
diff --git a/src/arch/amdgpu/vega/insts/mubuf.cc b/src/arch/amdgpu/vega/insts/mubuf.cc
new file mode 100644
index 0000000000..ffc68e8c2b
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/mubuf.cc
@@ -0,0 +1,2996 @@
+/*
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/amdgpu/vega/insts/instructions.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_X class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_X
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_X
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::~Inst_MUBUF__BUFFER_LOAD_FORMAT_X()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_X
+
+    // --- description from .arch file ---
+    // Untyped buffer load 1 dword with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XY class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XY(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_xy")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XY
+
+    // --- description from .arch file ---
+    // Untyped buffer load 2 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_xyz")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ
+
+    // --- description from .arch file ---
+    // Untyped buffer load 3 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW class methods ---
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
+        ::Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_load_format_xyzw")
+    {
+        setFlag(MemoryRef);
+        setFlag(Load);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
+
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW()
+    {
+    } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW
+
+    // --- description from .arch file ---
+    // Untyped buffer load 4 dwords with format conversion.
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // initiateAcc
+
+    void
+    Inst_MUBUF__BUFFER_LOAD_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+    // --- Inst_MUBUF__BUFFER_STORE_FORMAT_X class methods ---
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_X
+        ::Inst_MUBUF__BUFFER_STORE_FORMAT_X(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_store_format_x")
+    {
+        setFlag(MemoryRef);
+        setFlag(Store);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_STORE_FORMAT_X
+
+    Inst_MUBUF__BUFFER_STORE_FORMAT_X::~Inst_MUBUF__BUFFER_STORE_FORMAT_X()
+    {
+    } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_X
+
+    // --- description from .arch file ---
+    // Untyped buffer store 1 dword with format conversion.
+ void + Inst_MUBUF__BUFFER_STORE_FORMAT_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_X::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_X::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XY class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_XY + ::Inst_MUBUF__BUFFER_STORE_FORMAT_XY(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_xy") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_XY + + Inst_MUBUF__BUFFER_STORE_FORMAT_XY::~Inst_MUBUF__BUFFER_STORE_FORMAT_XY() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XY + + // --- description from .arch file --- + // Untyped buffer store 2 dwords with format conversion. + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XY::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XY::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ + ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_xyz") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ + + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ + + // --- description from .arch file --- + // Untyped buffer store 3 dwords with format conversion. 
+ void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZ::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW + ::Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_xyzw") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW + + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW + + // --- description from .arch file --- + // Untyped buffer store 4 dwords with format conversion. + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_XYZW::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_d16_x") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X + ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X + + // --- description from .arch file --- + // Untyped buffer load 1 dword with format conversion. 
+ void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_X::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_d16_xy") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY + ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY + + // --- description from .arch file --- + // Untyped buffer load 2 dwords with format conversion. + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XY::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyz") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ + ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ + + // --- description from .arch file --- + // Untyped buffer load 3 dwords with format conversion. 
+ void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZ::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW class methods --- + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW + ::Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_format_d16_xyzw") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW + + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW + ::~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW() + { + } // ~Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW + + // --- description from .arch file --- + // Untyped buffer load 4 dwords with format conversion. + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_FORMAT_D16_XYZW::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X + ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_d16_x") + { + setFlag(Store); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X + + // --- description from .arch file --- + // Untyped buffer store 1 dword with format conversion. 
+ void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_X::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY + ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_d16_xy") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY + + // --- description from .arch file --- + // Untyped buffer store 2 dwords with format conversion. + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XY::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ + ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyz") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ + + // --- description from .arch file --- + // Untyped buffer store 3 dwords with format conversion. 
+ void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZ::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW class methods --- + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW + ::Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_format_d16_xyzw") + { + setFlag(MemoryRef); + setFlag(Store); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW + + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW + ::~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW() + { + } // ~Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW + + // --- description from .arch file --- + // Untyped buffer store 4 dwords with format conversion. + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::initiateAcc( + GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_FORMAT_D16_XYZW::completeAcc( + GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_UBYTE class methods --- + + Inst_MUBUF__BUFFER_LOAD_UBYTE + ::Inst_MUBUF__BUFFER_LOAD_UBYTE(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_ubyte") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_UBYTE + + Inst_MUBUF__BUFFER_LOAD_UBYTE::~Inst_MUBUF__BUFFER_LOAD_UBYTE() + { + } // ~Inst_MUBUF__BUFFER_LOAD_UBYTE + + // --- description from .arch file --- + // Untyped buffer load unsigned byte (zero extend to VGPR destination). 
+ void + Inst_MUBUF__BUFFER_LOAD_UBYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_UBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_UBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // execute + + // --- Inst_MUBUF__BUFFER_LOAD_SBYTE class methods --- + + Inst_MUBUF__BUFFER_LOAD_SBYTE + ::Inst_MUBUF__BUFFER_LOAD_SBYTE(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, 
"buffer_load_sbyte") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_SBYTE + + Inst_MUBUF__BUFFER_LOAD_SBYTE::~Inst_MUBUF__BUFFER_LOAD_SBYTE() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SBYTE + + // --- description from .arch file --- + // Untyped buffer load signed byte (sign extend to VGPR destination). + void + Inst_MUBUF__BUFFER_LOAD_SBYTE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SBYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SBYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_USHORT class methods --- + + Inst_MUBUF__BUFFER_LOAD_USHORT + ::Inst_MUBUF__BUFFER_LOAD_USHORT(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_ushort") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_USHORT + + Inst_MUBUF__BUFFER_LOAD_USHORT::~Inst_MUBUF__BUFFER_LOAD_USHORT() + { + } // ~Inst_MUBUF__BUFFER_LOAD_USHORT + + // --- description from .arch file --- + // Untyped buffer load unsigned short (zero extend to VGPR destination). 
+ void + Inst_MUBUF__BUFFER_LOAD_USHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_USHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_USHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst[lane] = (VecElemU32)((reinterpret_cast( + gpuDynInst->d_data))[lane]); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // execute + + // --- Inst_MUBUF__BUFFER_LOAD_SSHORT class methods --- + + Inst_MUBUF__BUFFER_LOAD_SSHORT + ::Inst_MUBUF__BUFFER_LOAD_SSHORT(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, 
"buffer_load_sshort") + { + setFlag(MemoryRef); + setFlag(Load); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_LOAD_SSHORT + + Inst_MUBUF__BUFFER_LOAD_SSHORT::~Inst_MUBUF__BUFFER_LOAD_SSHORT() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SSHORT + + // --- description from .arch file --- + // Untyped buffer load signed short (sign extend to VGPR destination). + void + Inst_MUBUF__BUFFER_LOAD_SSHORT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SSHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16 class methods --- + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + ::Inst_MUBUF__BUFFER_LOAD_SHORT_D16(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_short_d16") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + warn("BUFFER.LDS not implemented!"); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::~Inst_MUBUF__BUFFER_LOAD_SHORT_D16() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + + // --- description from .arch file --- + // RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16; + // // RETURN_DATA[31:16] is preserved. 
+ void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + // For explanation of buffer addressing, see section 9.1.5 in: + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + VecElemU16 buf_val = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + 
replaceBits(vdst[lane], 15, 0, buf_val); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI class methods --- + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + ::Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_short_d16_hi") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + warn("BUFFER.LDS not implemented!"); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI:: + ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + + // --- description from .arch file --- + // VDATA[31 : 16].b16 = MEM[ADDR].b16; + // // VDATA[15:0] is preserved. + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + // For explanation of buffer addressing, see section 9.1.5 in: + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else 
if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + VecElemU16 buf_val = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + replaceBits(vdst[lane], 31, 16, buf_val); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORD + ::Inst_MUBUF__BUFFER_LOAD_DWORD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORD + + Inst_MUBUF__BUFFER_LOAD_DWORD::~Inst_MUBUF__BUFFER_LOAD_DWORD() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORD + + // --- description from .arch file --- + // Untyped buffer load dword. 
+ void + Inst_MUBUF__BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + // For explanation of buffer addressing, see section 9.1.5 in: + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } else { + 
vdst[lane] = 0; + } + } + } + + vdst.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORDX2 class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORDX2 + ::Inst_MUBUF__BUFFER_LOAD_DWORDX2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORDX2 + + Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer load 2 dwords. + void + Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + 
{ + initMemRead<2>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDATA); + VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 2 + 1]; + } else { + vdst0[lane] = 0; + vdst1[lane] = 0; + } + } + } + + vdst0.write(); + vdst1.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORDX3 class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORDX3 + ::Inst_MUBUF__BUFFER_LOAD_DWORDX3(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dwordx3") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORDX3 + + Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer load 3 dwords. 
+ void + Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<3>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDATA); + VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 
3 + 2]; + } else { + vdst0[lane] = 0; + vdst1[lane] = 0; + vdst2[lane] = 0; + } + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_DWORDX4 class methods --- + + Inst_MUBUF__BUFFER_LOAD_DWORDX4 + ::Inst_MUBUF__BUFFER_LOAD_DWORDX4(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_dwordx4") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_DWORDX4 + + Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4() + { + } // ~Inst_MUBUF__BUFFER_LOAD_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer load 4 dwords. + void + Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } 
// execute + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<4>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDATA); + VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } else { + vdst0[lane] = 0; + vdst1[lane] = 0; + vdst2[lane] = 0; + vdst3[lane] = 0; + } + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_BYTE class methods --- + + Inst_MUBUF__BUFFER_STORE_BYTE + ::Inst_MUBUF__BUFFER_STORE_BYTE(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_byte") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_BYTE + + Inst_MUBUF__BUFFER_STORE_BYTE::~Inst_MUBUF__BUFFER_STORE_BYTE() + { + } // ~Inst_MUBUF__BUFFER_STORE_BYTE + + // --- description from .arch file --- + // Untyped buffer store byte. 
+ void + Inst_MUBUF__BUFFER_STORE_BYTE::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandI8 data(gpuDynInst, extData.VDATA); + + rsrcDesc.read(); + offset.read(); + data.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_BYTE::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_BYTE::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_SHORT class methods --- + + Inst_MUBUF__BUFFER_STORE_SHORT + ::Inst_MUBUF__BUFFER_STORE_SHORT(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_short") + { + setFlag(MemoryRef); 
+ setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_SHORT + + Inst_MUBUF__BUFFER_STORE_SHORT::~Inst_MUBUF__BUFFER_STORE_SHORT() + { + } // ~Inst_MUBUF__BUFFER_STORE_SHORT + + // --- description from .arch file --- + // Untyped buffer store short. + void + Inst_MUBUF__BUFFER_STORE_SHORT::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandI16 data(gpuDynInst, extData.VDATA); + + rsrcDesc.read(); + offset.read(); + data.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_SHORT::initiateAcc(GPUDynInstPtr gpuDynInst) + { + 
initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_MUBUF__BUFFER_STORE_DWORD class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORD:: + Inst_MUBUF__BUFFER_STORE_DWORD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORD + + Inst_MUBUF__BUFFER_STORE_DWORD::~Inst_MUBUF__BUFFER_STORE_DWORD() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORD + + // --- description from .arch file --- + // Untyped buffer store dword. + void + Inst_MUBUF__BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data(gpuDynInst, extData.VDATA); + + rsrcDesc.read(); + offset.read(); + data.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + 
gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane] + = data[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_DWORDX2 class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORDX2 + ::Inst_MUBUF__BUFFER_STORE_DWORDX2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dwordx2") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORDX2 + + Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORDX2 + + // --- description from .arch file --- + // Untyped buffer store 2 dwords. 
+ void + Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); + ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); + + rsrcDesc.read(); + offset.read(); + data0.read(); + data1.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 4] + = data0[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] + = data1[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<2>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_DWORDX3 
class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORDX3 + ::Inst_MUBUF__BUFFER_STORE_DWORDX3(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dwordx3") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORDX3 + + Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORDX3 + + // --- description from .arch file --- + // Untyped buffer store 3 dwords. + void + Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); + ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); + + rsrcDesc.read(); + offset.read(); + data0.read(); + data1.read(); + data2.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + 
gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 4] + = data0[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] + = data1[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] + = data2[lane]; + } + } + } // execute + + void + Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<3>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_STORE_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_MUBUF__BUFFER_STORE_DWORDX4 class methods --- + + Inst_MUBUF__BUFFER_STORE_DWORDX4 + ::Inst_MUBUF__BUFFER_STORE_DWORDX4(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_store_dwordx4") + { + setFlag(MemoryRef); + setFlag(Store); + if (instData.LDS) { + setFlag(GroupSegment); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_STORE_DWORDX4 + + Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4() + { + } // ~Inst_MUBUF__BUFFER_STORE_DWORDX4 + + // --- description from .arch file --- + // Untyped buffer store 4 dwords. 
+ void + Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + wf->decExpInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 data0(gpuDynInst, extData.VDATA); + ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3); + + rsrcDesc.read(); + offset.read(); + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->d_data))[lane * 4] + = data0[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 1] + = data1[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 2] + = data2[lane]; + (reinterpret_cast(gpuDynInst->d_data))[lane*4 + 3] + = data3[lane]; + } + } + } // execute 
    void
    Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        initMemWrite<4>(gpuDynInst);
    } // initiateAcc

    void
    Inst_MUBUF__BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Stores have nothing to write back to the register file.
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_STORE_LDS_DWORD class methods ---

    Inst_MUBUF__BUFFER_STORE_LDS_DWORD
        ::Inst_MUBUF__BUFFER_STORE_LDS_DWORD(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_store_lds_dword")
    {
        setFlag(Store);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_STORE_LDS_DWORD

    Inst_MUBUF__BUFFER_STORE_LDS_DWORD::~Inst_MUBUF__BUFFER_STORE_LDS_DWORD()
    {
    } // ~Inst_MUBUF__BUFFER_STORE_LDS_DWORD

    // --- description from .arch file ---
    // Store one DWORD from LDS memory to system memory without utilizing
    // VGPRs.
    void
    Inst_MUBUF__BUFFER_STORE_LDS_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        // Decode-only stub: not supported by the timing model.
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_WBINVL1 class methods ---

    Inst_MUBUF__BUFFER_WBINVL1::Inst_MUBUF__BUFFER_WBINVL1(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_wbinvl1")
    {
        setFlag(MemoryRef);
        // NOTE(review): MemSync is set twice here (once qualified, once
        // not) — presumably redundant; confirm both resolve to the same
        // GPUStaticInst flag before removing either.
        setFlag(GPUStaticInst::MemSync);
        setFlag(GlobalSegment);
        setFlag(MemSync);
    } // Inst_MUBUF__BUFFER_WBINVL1

    Inst_MUBUF__BUFFER_WBINVL1::~Inst_MUBUF__BUFFER_WBINVL1()
    {
    } // ~Inst_MUBUF__BUFFER_WBINVL1

    // --- description from .arch file ---
    // Write back and invalidate the shader L1.
    // Always returns ACK to shader.
    void
    Inst_MUBUF__BUFFER_WBINVL1::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        // No active lanes: undo the issue-time bookkeeping and bail out.
        if (gpuDynInst->exec_mask.none()) {
            wf->decVMemInstsIssued();
            return;
        }

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
        } else {
            // NOTE(review): message says "flat" but this is a buffer
            // (MUBUF) instruction — likely copied from the flat path.
            fatal("Unsupported scope for flat instruction.\n");
        }
    } // execute

    void
    Inst_MUBUF__BUFFER_WBINVL1::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // TODO: Fix it for gfx10. Once we have the new gfx10 cache model, we
        // need to precisely communicate the writeback-invalidate operation to
        // the new gfx10 coalescer rather than sending AcquireRelease markers.
        // The SICoalescer would need to be updated appropriately as well.
        injectGlobalMemFence(gpuDynInst);
    } // initiateAcc
    void
    Inst_MUBUF__BUFFER_WBINVL1::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_WBINVL1_VOL class methods ---

    Inst_MUBUF__BUFFER_WBINVL1_VOL
        ::Inst_MUBUF__BUFFER_WBINVL1_VOL(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_wbinvl1_vol") {
        // This instruction is same as buffer_wbinvl1 instruction except this
        // instruction only invalidates L1 shader lines with MTYPE SC and GC.
        // Since Hermes L1 (TCP) does not differentiate between its cache
        // lines, this instruction currently behaves (and is implemented)
        // exactly like the buffer_wbinvl1 instruction.
        setFlag(MemoryRef);
        setFlag(GPUStaticInst::MemSync);
        setFlag(GlobalSegment);
        setFlag(MemSync);
    } // Inst_MUBUF__BUFFER_WBINVL1_VOL

    Inst_MUBUF__BUFFER_WBINVL1_VOL::~Inst_MUBUF__BUFFER_WBINVL1_VOL()
    {
    } // ~Inst_MUBUF__BUFFER_WBINVL1_VOL

    // --- description from .arch file ---
    // Write back and invalidate the shader L1 only for lines that are marked
    // --- volatile.
    // Always returns ACK to shader.
    void
    Inst_MUBUF__BUFFER_WBINVL1_VOL::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        // No active lanes: undo the issue-time bookkeeping and bail out.
        if (gpuDynInst->exec_mask.none()) {
            wf->decVMemInstsIssued();
            return;
        }

        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            gpuDynInst->computeUnit()->globalMemoryPipe.
                issueRequest(gpuDynInst);
        } else {
            // NOTE(review): message says "flat" but this is a buffer
            // (MUBUF) instruction — likely copied from the flat path.
            fatal("Unsupported scope for flat instruction.\n");
        }
    } // execute
    void
    Inst_MUBUF__BUFFER_WBINVL1_VOL::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Modeled as a global memory fence, same as buffer_wbinvl1.
        injectGlobalMemFence(gpuDynInst);
    } // initiateAcc
    void
    Inst_MUBUF__BUFFER_WBINVL1_VOL::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_SWAP
        ::Inst_MUBUF__BUFFER_ATOMIC_SWAP(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_swap")
    {
        setFlag(AtomicExch);
        // GLC=1 means the atomic returns the pre-op memory value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SWAP

    Inst_MUBUF__BUFFER_ATOMIC_SWAP::~Inst_MUBUF__BUFFER_ATOMIC_SWAP()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA;
    // RETURN_DATA = tmp.
+ void + Inst_MUBUF__BUFFER_ATOMIC_SWAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP + ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap") + { + setFlag(AtomicCAS); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP + + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // src = DATA[0]; + // cmp = DATA[1]; + // MEM[ADDR] = (tmp == cmp) ? src : tmp; + // RETURN_DATA[0] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 src(gpuDynInst, extData.VDATA); + ConstVecOperandU32 cmp(gpuDynInst, extData.VDATA + 1); + + rsrcDesc.read(); + offset.read(); + src.read(); + cmp.read(); + + int inst_offset = instData.OFFSET; + + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + 
calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast(gpuDynInst->x_data))[lane] + = src[lane]; + (reinterpret_cast(gpuDynInst->a_data))[lane] + = cmp[lane]; + } + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) + { + if (isAtomicRet()) { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } + } // completeAcc + // --- Inst_MUBUF__BUFFER_ATOMIC_ADD class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_ADD + ::Inst_MUBUF__BUFFER_ATOMIC_ADD(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_add") + { + setFlag(AtomicAdd); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_ADD + + Inst_MUBUF__BUFFER_ATOMIC_ADD::~Inst_MUBUF__BUFFER_ATOMIC_ADD() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD + + // --- description from .arch file --- + // 32b: + // tmp = MEM[ADDR]; + // MEM[ADDR] += DATA; + // RETURN_DATA = tmp. 
    void
    Inst_MUBUF__BUFFER_ATOMIC_ADD::execute(GPUDynInstPtr gpuDynInst)
    {
        // Decode-only stub: this buffer atomic is not supported by the
        // timing model; the same applies to the remaining stubs below.
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_SUB class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_SUB
        ::Inst_MUBUF__BUFFER_ATOMIC_SUB(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_sub")
    {
        setFlag(AtomicSub);
        // GLC=1 means the atomic returns the pre-op memory value.
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SUB

    Inst_MUBUF__BUFFER_ATOMIC_SUB::~Inst_MUBUF__BUFFER_ATOMIC_SUB()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_SMIN
        ::Inst_MUBUF__BUFFER_ATOMIC_SMIN(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_smin")
    {
        setFlag(AtomicMin);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SMIN

    Inst_MUBUF__BUFFER_ATOMIC_SMIN::~Inst_MUBUF__BUFFER_ATOMIC_SMIN()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_SMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_UMIN
        ::Inst_MUBUF__BUFFER_ATOMIC_UMIN(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_umin")
    {
        setFlag(AtomicMin);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_UMIN

    Inst_MUBUF__BUFFER_ATOMIC_UMIN::~Inst_MUBUF__BUFFER_ATOMIC_UMIN()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA < tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_UMIN::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_SMAX
        ::Inst_MUBUF__BUFFER_ATOMIC_SMAX(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_smax")
    {
        setFlag(AtomicMax);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SMAX

    Inst_MUBUF__BUFFER_ATOMIC_SMAX::~Inst_MUBUF__BUFFER_ATOMIC_SMAX()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (signed compare);
    // RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_SMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_UMAX
        ::Inst_MUBUF__BUFFER_ATOMIC_UMAX(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_umax")
    {
        setFlag(AtomicMax);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_UMAX

    Inst_MUBUF__BUFFER_ATOMIC_UMAX::~Inst_MUBUF__BUFFER_ATOMIC_UMAX()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (DATA > tmp) ? DATA : tmp (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_UMAX::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_AND class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_AND
        ::Inst_MUBUF__BUFFER_ATOMIC_AND(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_and")
    {
        setFlag(AtomicAnd);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_AND

    Inst_MUBUF__BUFFER_ATOMIC_AND::~Inst_MUBUF__BUFFER_ATOMIC_AND()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_AND

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] &= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_AND::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_OR class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_OR
        ::Inst_MUBUF__BUFFER_ATOMIC_OR(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_or")
    {
        setFlag(AtomicOr);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_OR

    Inst_MUBUF__BUFFER_ATOMIC_OR::~Inst_MUBUF__BUFFER_ATOMIC_OR()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_OR

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] |= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_OR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_XOR class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_XOR
        ::Inst_MUBUF__BUFFER_ATOMIC_XOR(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_xor")
    {
        setFlag(AtomicXor);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_XOR

    Inst_MUBUF__BUFFER_ATOMIC_XOR::~Inst_MUBUF__BUFFER_ATOMIC_XOR()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] ^= DATA;
    // RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_XOR::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_INC class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_INC
        ::Inst_MUBUF__BUFFER_ATOMIC_INC(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_inc")
    {
        setFlag(AtomicInc);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_INC

    Inst_MUBUF__BUFFER_ATOMIC_INC::~Inst_MUBUF__BUFFER_ATOMIC_INC()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_INC

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp >= DATA) ? 0 : tmp + 1 (unsigned compare);
    // RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_DEC class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_DEC
        ::Inst_MUBUF__BUFFER_ATOMIC_DEC(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_dec")
    {
        setFlag(AtomicDec);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_DEC

    Inst_MUBUF__BUFFER_ATOMIC_DEC::~Inst_MUBUF__BUFFER_ATOMIC_DEC()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC

    // --- description from .arch file ---
    // 32b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = (tmp == 0 || tmp > DATA) ? DATA : tmp - 1
    // (unsigned compare); RETURN_DATA = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2 class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_swap_x2")
    {
        setFlag(AtomicExch);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2

    Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] = DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_SWAP_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2 class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_cmpswap_x2")
    {
        setFlag(AtomicCAS);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2

    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2
        ::~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // src = DATA[0:1];
    // cmp = DATA[2:3];
    // MEM[ADDR] = (tmp == cmp) ? src : tmp;
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_ADD_X2 class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_ADD_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_ADD_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_add_x2")
    {
        setFlag(AtomicAdd);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_ADD_X2

    Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_ADD_X2

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] += DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
    void
    Inst_MUBUF__BUFFER_ATOMIC_ADD_X2::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_MUBUF__BUFFER_ATOMIC_SUB_X2 class methods ---

    Inst_MUBUF__BUFFER_ATOMIC_SUB_X2
        ::Inst_MUBUF__BUFFER_ATOMIC_SUB_X2(InFmt_MUBUF *iFmt)
        : Inst_MUBUF(iFmt, "buffer_atomic_sub_x2")
    {
        setFlag(AtomicSub);
        if (instData.GLC) {
            setFlag(AtomicReturn);
        } else {
            setFlag(AtomicNoReturn);
        }
        setFlag(MemoryRef);
        setFlag(GlobalSegment);
    } // Inst_MUBUF__BUFFER_ATOMIC_SUB_X2

    Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2()
    {
    } // ~Inst_MUBUF__BUFFER_ATOMIC_SUB_X2

    // --- description from .arch file ---
    // 64b:
    // tmp = MEM[ADDR];
    // MEM[ADDR] -= DATA[0:1];
    // RETURN_DATA[0:1] = tmp.
+ void + Inst_MUBUF__BUFFER_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umin_x2") + { + setFlag(AtomicMin); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 + + Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] < tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_UMIN_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_smax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 + + Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (signed compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_SMAX_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_umax_x2") + { + setFlag(AtomicMax); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 + + Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] -= (DATA[0:1] > tmp) ? DATA[0:1] : tmp (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+    void
+    Inst_MUBUF__BUFFER_ATOMIC_UMAX_X2::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_MUBUF__BUFFER_ATOMIC_AND_X2 class methods ---
+
+    Inst_MUBUF__BUFFER_ATOMIC_AND_X2
+        ::Inst_MUBUF__BUFFER_ATOMIC_AND_X2(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_atomic_and_x2")
+    {
+        setFlag(AtomicAnd);
+        if (instData.GLC) {
+            setFlag(AtomicReturn);
+        } else {
+            setFlag(AtomicNoReturn);
+        }
+        setFlag(MemoryRef);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_ATOMIC_AND_X2
+
+    Inst_MUBUF__BUFFER_ATOMIC_AND_X2::~Inst_MUBUF__BUFFER_ATOMIC_AND_X2()
+    {
+    } // ~Inst_MUBUF__BUFFER_ATOMIC_AND_X2
+
+    // --- description from .arch file ---
+    // 64b:
+    // tmp = MEM[ADDR];
+    // MEM[ADDR] &= DATA[0:1];
+    // RETURN_DATA[0:1] = tmp.
+    void
+    Inst_MUBUF__BUFFER_ATOMIC_AND_X2::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_MUBUF__BUFFER_ATOMIC_OR_X2 class methods ---
+
+    Inst_MUBUF__BUFFER_ATOMIC_OR_X2
+        ::Inst_MUBUF__BUFFER_ATOMIC_OR_X2(InFmt_MUBUF *iFmt)
+        : Inst_MUBUF(iFmt, "buffer_atomic_or_x2")
+    {
+        setFlag(AtomicOr);
+        if (instData.GLC) {
+            setFlag(AtomicReturn);
+        } else {
+            setFlag(AtomicNoReturn);
+        }
+        // These two flags were missing here but are set by every other
+        // MUBUF atomic in this file; without them the instruction is not
+        // classified as a global-segment memory reference.
+        setFlag(MemoryRef);
+        setFlag(GlobalSegment);
+    } // Inst_MUBUF__BUFFER_ATOMIC_OR_X2
+
+    Inst_MUBUF__BUFFER_ATOMIC_OR_X2::~Inst_MUBUF__BUFFER_ATOMIC_OR_X2()
+    {
+    } // ~Inst_MUBUF__BUFFER_ATOMIC_OR_X2
+
+    // --- description from .arch file ---
+    // 64b:
+    // tmp = MEM[ADDR];
+    // MEM[ADDR] |= DATA[0:1];
+    // RETURN_DATA[0:1] = tmp.
+ void + Inst_MUBUF__BUFFER_ATOMIC_OR_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_XOR_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_xor_x2") + { + setFlag(AtomicXor); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 + + Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_XOR_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] ^= DATA[0:1]; + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_XOR_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_INC_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_INC_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_INC_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_inc_x2") + { + setFlag(AtomicInc); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_INC_X2 + + Inst_MUBUF__BUFFER_ATOMIC_INC_X2::~Inst_MUBUF__BUFFER_ATOMIC_INC_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_INC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp >= DATA[0:1]) ? 0 : tmp + 1 (unsigned compare); + // RETURN_DATA[0:1] = tmp. 
+ void + Inst_MUBUF__BUFFER_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 class methods --- + + Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 + ::Inst_MUBUF__BUFFER_ATOMIC_DEC_X2(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_atomic_dec_x2") + { + setFlag(AtomicDec); + if (instData.GLC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + setFlag(MemoryRef); + setFlag(GlobalSegment); + } // Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 + + Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2() + { + } // ~Inst_MUBUF__BUFFER_ATOMIC_DEC_X2 + + // --- description from .arch file --- + // 64b: + // tmp = MEM[ADDR]; + // MEM[ADDR] = (tmp == 0 || tmp > DATA[0:1]) ? DATA[0:1] : tmp - 1 + // (unsigned compare); + // RETURN_DATA[0:1] = tmp. + void + Inst_MUBUF__BUFFER_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/op_encodings.cc b/src/arch/amdgpu/vega/insts/op_encodings.cc index c934094d9b..0b4e894e75 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.cc +++ b/src/arch/amdgpu/vega/insts/op_encodings.cc @@ -1178,6 +1178,158 @@ namespace VegaISA disassembly = dis_stream.str(); } + // --- Inst_VOP3P base class methods --- + + Inst_VOP3P::Inst_VOP3P(InFmt_VOP3P *iFmt, const std::string &opcode) + : VEGAGPUStaticInst(opcode) + { + // copy first instruction DWORD + instData = iFmt[0]; + // copy second instruction DWORD + extData = ((InFmt_VOP3P_1 *)iFmt)[1]; + } // Inst_VOP3P + + Inst_VOP3P::~Inst_VOP3P() + { + } // ~Inst_VOP3P + + void + Inst_VOP3P::initOperandInfo() + { + // Also takes care of bitfield addr issue + unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; + + int opNum = 0; + + int numSrc = numSrcRegOperands(); + + for (opNum = 0; opNum < numSrc; opNum++) { + srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true, + 
isScalarReg(srcs[opNum]), + isVectorReg(srcs[opNum]), false); + } + + // There is always one dest + // Needed because can't take addr of bitfield + int reg = instData.VDST; + dstOps.emplace_back(reg, getOperandSize(opNum), false, + false, true, false); + opNum++; + + assert(srcOps.size() == numSrcRegOperands()); + assert(dstOps.size() == numDstRegOperands()); + } + + int + Inst_VOP3P::instSize() const + { + return 8; + } // instSize + + void + Inst_VOP3P::generateDisassembly() + { + std::stringstream dis_stream; + dis_stream << _opcode << " "; + + // There is always a dest and the index is after the src operands + // The output size much be a multiple of dword size + int dst_size = getOperandSize(numSrcRegOperands()); + + dis_stream << opSelectorToRegSym(instData.VDST + 0x100, dst_size / 4); + + unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; + for (int opnum = 0; opnum < numSrcRegOperands(); opnum++) { + int num_regs = getOperandSize(opnum) / 4; + dis_stream << ", " << opSelectorToRegSym(srcs[opnum], num_regs); + } + + // Print op_sel only if one is non-zero + if (instData.OPSEL) { + int opsel = instData.OPSEL; + + dis_stream << " op_sel:[" << bits(opsel, 0, 0) << "," + << bits(opsel, 1, 1) << "," << bits(opsel, 2, 2) << "]"; + } + + disassembly = dis_stream.str(); + } + + // --- Inst_VOP3P_MAI base class methods --- + + Inst_VOP3P_MAI::Inst_VOP3P_MAI(InFmt_VOP3P_MAI *iFmt, + const std::string &opcode) + : VEGAGPUStaticInst(opcode) + { + // copy first instruction DWORD + instData = iFmt[0]; + // copy second instruction DWORD + extData = ((InFmt_VOP3P_MAI_1 *)iFmt)[1]; + } // Inst_VOP3P_MAI + + Inst_VOP3P_MAI::~Inst_VOP3P_MAI() + { + } // ~Inst_VOP3P_MAI + + void + Inst_VOP3P_MAI::initOperandInfo() + { + // Also takes care of bitfield addr issue + unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; + + int opNum = 0; + + int numSrc = numSrcRegOperands(); + + for (opNum = 0; opNum < numSrc; opNum++) { + 
srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true,
+                                isScalarReg(srcs[opNum]),
+                                isVectorReg(srcs[opNum]), false);
+        }
+
+        // There is always one dest
+        // Needed because can't take addr of bitfield
+        int reg = instData.VDST;
+        dstOps.emplace_back(reg, getOperandSize(opNum), false,
+                            false, true, false);
+        opNum++;
+
+        assert(srcOps.size() == numSrcRegOperands());
+        assert(dstOps.size() == numDstRegOperands());
+    }
+
+    int
+    Inst_VOP3P_MAI::instSize() const
+    {
+        return 8;
+    } // instSize
+
+    void
+    Inst_VOP3P_MAI::generateDisassembly()
+    {
+        std::stringstream dis_stream;
+        dis_stream << _opcode << " ";
+
+        // There is always a dest and the index is after the src operands
+        // The output size must be a multiple of dword size
+        int dst_size = getOperandSize(numSrcRegOperands());
+
+        // opSelectorToRegSym handles formatting for us. VDST is always VGPR
+        // so only the last 8 bits are used. This adds the implicit 9th bit
+        // which is 1 for VGPRs as VGPR op nums are from 256-511.
+ int dst_opnum = instData.VDST + 0x100; + + dis_stream << opSelectorToRegSym(dst_opnum, dst_size / 4); + + unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; + for (int opnum = 0; opnum < numSrcRegOperands(); opnum++) { + int num_regs = getOperandSize(opnum) / 4; + dis_stream << ", " << opSelectorToRegSym(srcs[opnum], num_regs); + } + + disassembly = dis_stream.str(); + } + // --- Inst_DS base class methods --- Inst_DS::Inst_DS(InFmt_DS *iFmt, const std::string &opcode) @@ -1695,10 +1847,10 @@ namespace VegaISA // One of the flat subtypes should be specified via flags assert(isFlat() ^ isFlatGlobal() ^ isFlatScratch()); - if (isFlat()) { - generateFlatDisassembly(); - } else if (isFlatGlobal() || isFlatScratch()) { + if (isFlatGlobal() || isFlatScratch()) { generateGlobalScratchDisassembly(); + } else if (isFlat()) { + generateFlatDisassembly(); } else { panic("Unknown flat subtype!\n"); } @@ -1710,13 +1862,19 @@ namespace VegaISA std::stringstream dis_stream; dis_stream << _opcode << " "; - if (isLoad()) - dis_stream << "v" << extData.VDST << ", "; + if (isLoad() || isAtomic()) { + int dst_size = getOperandSize(numSrcRegOperands()) / 4; + dis_stream << opSelectorToRegSym(extData.VDST + 0x100, dst_size) + << ", "; + } - dis_stream << "v[" << extData.ADDR << ":" << extData.ADDR + 1 << "]"; + dis_stream << opSelectorToRegSym(extData.ADDR + 0x100, 2); - if (isStore()) - dis_stream << ", v" << extData.DATA; + if (isStore() || isAtomic()) { + int src_size = getOperandSize(1) / 4; + dis_stream << ", " + << opSelectorToRegSym(extData.DATA + 0x100, src_size); + } disassembly = dis_stream.str(); } @@ -1736,25 +1894,38 @@ namespace VegaISA std::stringstream dis_stream; dis_stream << global_opcode << " "; - if (isLoad()) - dis_stream << "v" << extData.VDST << ", "; + if (isLoad() || isAtomic()) { + // dest is the first operand after all the src operands + int dst_size = getOperandSize(numSrcRegOperands()) / 4; + dis_stream << opSelectorToRegSym(extData.VDST + 
0x100, dst_size) + << ", "; + } - if (extData.SADDR == 0x7f) - dis_stream << "v[" << extData.ADDR << ":" << extData.ADDR+1 << "]"; - else - dis_stream << "v" << extData.ADDR; + if (extData.SADDR == 0x7f) { + dis_stream << opSelectorToRegSym(extData.ADDR + 0x100, 2); + } else { + dis_stream << opSelectorToRegSym(extData.ADDR + 0x100, 1); + } - if (isStore()) - dis_stream << ", v" << extData.DATA; + if (isStore() || isAtomic()) { + int src_size = getOperandSize(1) / 4; + dis_stream << ", " + << opSelectorToRegSym(extData.DATA + 0x100, src_size); + } - if (extData.SADDR == 0x7f) + if (extData.SADDR == 0x7f) { dis_stream << ", off"; - else - dis_stream << ", s[" << extData.SADDR << ":" << extData.SADDR+1 - << "]"; + } else { + dis_stream << ", " << opSelectorToRegSym(extData.SADDR, 2); + } - if (instData.OFFSET) + if (instData.OFFSET) { dis_stream << " offset:" << instData.OFFSET; + } + + if (instData.GLC) { + dis_stream << " glc"; + } disassembly = dis_stream.str(); } diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index a1c5e99c91..504946534f 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -455,6 +455,29 @@ namespace VegaISA // second instruction DWORD InFmt_VOP3_1 extData; + // Output modifier for VOP3 instructions. This 2-bit field can be set + // to "0" to do nothing, "1" to multiply output value by 2, "2" to + // multiply output value by 4, or "3" to divide output value by 2. If + // the instruction supports clamping, this is applied *before* clamp + // but after the abs and neg modifiers. 
+        template<typename T>
+        T omodModifier(T val, unsigned omod)
+        {
+            assert(omod < 4);
+
+            if constexpr (std::is_floating_point_v<T>) {
+                if (omod == 1) return val * T(2.0f);
+                if (omod == 2) return val * T(4.0f);
+                if (omod == 3) return val / T(2.0f);
+            } else {
+                assert(std::is_integral_v<T>);
+                if (omod == 1) return val * T(2);
+                if (omod == 2) return val * T(4);
+                if (omod == 3) return val / T(2);
+            }
+
+            return val;
+        }
       private:
         bool hasSecondDword(InFmt_VOP3A *);
         /**
@@ -491,6 +514,199 @@ namespace VegaISA
         bool hasSecondDword(InFmt_VOP3B *);
     }; // Inst_VOP3B
 
+    class Inst_VOP3P : public VEGAGPUStaticInst
+    {
+      public:
+        Inst_VOP3P(InFmt_VOP3P*, const std::string &opcode);
+        ~Inst_VOP3P();
+
+        int instSize() const override;
+        void generateDisassembly() override;
+
+        void initOperandInfo() override;
+
+      protected:
+        // first instruction DWORD
+        InFmt_VOP3P instData;
+        // second instruction DWORD
+        InFmt_VOP3P_1 extData;
+
+        // Helper for two-source packed ops: fOpImpl computes one 16-bit
+        // result and is applied independently to the high and low words
+        // of each packed dword. T must be a 16-bit type (see word<T>).
+        template<typename T>
+        void vop3pHelper(GPUDynInstPtr gpuDynInst,
+                         T (*fOpImpl)(T, T, bool))
+        {
+            Wavefront *wf = gpuDynInst->wavefront();
+            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
+            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
+            VecOperandU32 D(gpuDynInst, instData.VDST);
+
+            S0.readSrc();
+            S1.readSrc();
+
+            int opLo = instData.OPSEL;
+            int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
+            int negLo = extData.NEG;
+            int negHi = instData.NEG_HI;
+            bool clamp = instData.CLMP;
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
+                                          word<T>(S1[lane], opHi, negHi, 1),
+                                          clamp);
+                    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
+                                          word<T>(S1[lane], opLo, negLo, 1),
+                                          clamp);
+
+                    uint16_t upper_raw =
+                        *reinterpret_cast<uint16_t*>(&upper_val);
+                    uint16_t lower_raw =
+                        *reinterpret_cast<uint16_t*>(&lower_val);
+
+                    D[lane] = upper_raw << 16 | lower_raw;
+                }
+            }
+
+            D.write();
+        }
+
+        // Three-source overload of the packed-op helper (e.g. pk_fma).
+        template<typename T>
+        void vop3pHelper(GPUDynInstPtr gpuDynInst,
+                         T (*fOpImpl)(T, T, T, bool))
+        {
+            Wavefront *wf =
gpuDynInst->wavefront(); + ConstVecOperandU32 S0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 S1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 S2(gpuDynInst, extData.SRC2); + VecOperandU32 D(gpuDynInst, instData.VDST); + + S0.readSrc(); + S1.readSrc(); + S2.readSrc(); + + int opLo = instData.OPSEL; + int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI; + int negLo = extData.NEG; + int negHi = instData.NEG_HI; + bool clamp = instData.CLMP; + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + T upper_val = fOpImpl(word(S0[lane], opHi, negHi, 0), + word(S1[lane], opHi, negHi, 1), + word(S2[lane], opHi, negHi, 2), + clamp); + T lower_val = fOpImpl(word(S0[lane], opLo, negLo, 0), + word(S1[lane], opLo, negLo, 1), + word(S2[lane], opLo, negLo, 2), + clamp); + + uint16_t upper_raw = + *reinterpret_cast(&upper_val); + uint16_t lower_raw = + *reinterpret_cast(&lower_val); + + D[lane] = upper_raw << 16 | lower_raw; + } + } + + D.write(); + } + + void + dotHelper(GPUDynInstPtr gpuDynInst, + uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool)) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 S0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 S1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 S2(gpuDynInst, extData.SRC2); + VecOperandU32 D(gpuDynInst, instData.VDST); + + S0.readSrc(); + S1.readSrc(); + S2.readSrc(); + + // OPSEL[2] and OPSEL_HI2 are unused. 
Craft two dwords where: + // dword1[15:0] is upper/lower 16b of src0 based on opsel[0] + // dword1[31:15] is upper/lower 16b of src0 based on opsel_hi[0] + // dword2[15:0] is upper/lower 16b of src1 based on opsel[1] + // dword2[31:15] is upper/lower 16b of src1 based on opsel_hi[1] + int opLo = instData.OPSEL; + int opHi = extData.OPSEL_HI; + int negLo = extData.NEG; + int negHi = instData.NEG_HI; + bool clamp = instData.CLMP; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t dword1l = + word(S0[lane], opLo, negLo, 0); + uint32_t dword1h = + word(S0[lane], opHi, negHi, 0); + uint32_t dword2l = + word(S1[lane], opLo, negLo, 1); + uint32_t dword2h = + word(S1[lane], opHi, negHi, 1); + + uint32_t dword1 = (dword1h << 16) | dword1l; + uint32_t dword2 = (dword2h << 16) | dword2l; + + // Take in two uint32_t dwords and one src2 dword. The + // function will need to call bits to break up to the + // correct size and then reinterpret cast to the correct + // value. + D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp); + } + } + + D.write(); + } + + private: + bool hasSecondDword(InFmt_VOP3P *); + + template + T + word(uint32_t data, int opSel, int neg, int opSelBit) + { + // This method assumes two words packed into a dword + static_assert(sizeof(T) == 2); + + bool select = bits(opSel, opSelBit, opSelBit); + uint16_t raw = select ? bits(data, 31, 16) + : bits(data, 15, 0); + + // Apply input modifiers. This may seem odd, but the hardware + // just flips the MSb instead of doing unary negation. 
+ bool negate = bits(neg, opSelBit, opSelBit); + if (negate) { + raw ^= 0x8000; + } + + return *reinterpret_cast(&raw); + } + }; // Inst_VOP3P + + class Inst_VOP3P_MAI : public VEGAGPUStaticInst + { + public: + Inst_VOP3P_MAI(InFmt_VOP3P_MAI*, const std::string &opcode); + ~Inst_VOP3P_MAI(); + + int instSize() const override; + void generateDisassembly() override; + + void initOperandInfo() override; + + protected: + // first instruction DWORD + InFmt_VOP3P_MAI instData; + // second instruction DWORD + InFmt_VOP3P_MAI_1 extData; + + private: + bool hasSecondDword(InFmt_VOP3P_MAI *); + }; // Inst_VOP3P + class Inst_DS : public VEGAGPUStaticInst { public: @@ -1065,13 +1281,12 @@ namespace VegaISA // If saddr = 0x7f there is no scalar reg to read and address will // be a 64-bit address. Otherwise, saddr is the reg index for a // scalar reg used as the base address for a 32-bit address. - if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch())) - || isFlat()) { + if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) { ConstVecOperandU64 vbase(gpuDynInst, vaddr); vbase.read(); calcAddrVgpr(gpuDynInst, vbase, offset); - } else { + } else if (isFlatGlobal()) { // Assume we are operating in 64-bit mode and read a pair of // SGPRs for the address base. ConstScalarOperandU64 sbase(gpuDynInst, saddr); @@ -1081,6 +1296,68 @@ namespace VegaISA voffset.read(); calcAddrSgpr(gpuDynInst, voffset, sbase, offset); + // For scratch, saddr = 0x7f there is no scalar reg to read and + // a vgpr will be used for address offset. Otherwise, saddr is + // the sgpr index holding the address offset. For scratch + // instructions the offset GPR is always 32-bits. 
+ } else if (saddr != 0x7f) { + assert(isFlatScratch()); + + ConstScalarOperandU32 soffset(gpuDynInst, saddr); + soffset.read(); + + ConstVecOperandU32 voffset(gpuDynInst, vaddr); + if (instData.SVE) { + voffset.read(); + } + + Addr flat_scratch_addr = readFlatScratch(gpuDynInst); + + int elemSize; + auto staticInst = gpuDynInst->staticInstruction(); + if (gpuDynInst->isLoad()) { + elemSize = staticInst->getOperandSize(2); + } else { + assert(gpuDynInst->isStore()); + elemSize = staticInst->getOperandSize(1); + } + + unsigned swizzleOffset = soffset.rawData() + offset; + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + swizzleOffset += instData.SVE ? voffset[lane] : 0; + gpuDynInst->addr.at(lane) = flat_scratch_addr + + swizzle(swizzleOffset, lane, elemSize); + } + } + } else { + assert(isFlatScratch()); + + ConstVecOperandU32 voffset(gpuDynInst, vaddr); + if (instData.SVE) { + voffset.read(); + } + + Addr flat_scratch_addr = readFlatScratch(gpuDynInst); + + int elemSize; + auto staticInst = gpuDynInst->staticInstruction(); + if (gpuDynInst->isLoad()) { + elemSize = staticInst->getOperandSize(2); + } else { + assert(gpuDynInst->isStore()); + elemSize = staticInst->getOperandSize(1); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + VecElemU32 vgpr_offset = + instData.SVE ? voffset[lane] : 0; + + gpuDynInst->addr.at(lane) = flat_scratch_addr + + swizzle(vgpr_offset + offset, lane, elemSize); + } + } } if (isFlat()) { @@ -1092,6 +1369,7 @@ namespace VegaISA assert(isFlatScratch()); gpuDynInst->staticInstruction()->executed_as = enums::SC_PRIVATE; + gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask); } } @@ -1113,6 +1391,77 @@ namespace VegaISA } } + // Execute for atomics is identical besides the flag set in the + // constructor, except cmpswap. For cmpswap, the offset to the "cmp" + // register is needed. 
For all other operations this offset is zero + // and implies the atomic is not a cmpswap. + // RegT defines the type of GPU register (e.g., ConstVecOperandU32) + // LaneT defines the type of the register elements (e.g., VecElemU32) + template + void + atomicExecute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + if (isFlat()) { + wf->decLGKMInstsIssued(); + } + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + RegT data(gpuDynInst, extData.DATA); + RegT cmp(gpuDynInst, extData.DATA + CmpRegOffset); + + data.read(); + if constexpr (CmpRegOffset) { + cmp.read(); + } + + calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if constexpr (CmpRegOffset) { + (reinterpret_cast( + gpuDynInst->x_data))[lane] = data[lane]; + (reinterpret_cast( + gpuDynInst->a_data))[lane] = cmp[lane]; + } else { + (reinterpret_cast(gpuDynInst->a_data))[lane] + = data[lane]; + } + } + } + + issueRequestHelper(gpuDynInst); + } + + // RegT defines the type of GPU register (e.g., ConstVecOperandU32) + // LaneT defines the type of the register elements (e.g., VecElemU32) + template + void + atomicComplete(GPUDynInstPtr gpuDynInst) + { + if (isAtomicRet()) { + RegT vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } + } + bool vgprIsOffset() { @@ -1157,6 +1506,23 @@ namespace VegaISA } } } + + VecElemU32 + swizzle(VecElemU32 offset, int lane, int elem_size) + { + // This is not described in the spec. We use the swizzle from + // buffer memory instructions and fix the stride to 4. 
Multiply + // the thread ID by the storage size to avoid threads clobbering + // their data. + return ((offset / 4) * 4 * 64) + + (offset % 4) + (lane * elem_size); + } + + Addr + readFlatScratch(GPUDynInstPtr gpuDynInst) + { + return gpuDynInst->computeUnit()->shader->getScratchBase(); + } }; // Inst_FLAT } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/smem.cc b/src/arch/amdgpu/vega/insts/smem.cc new file mode 100644 index 0000000000..a6af4f007d --- /dev/null +++ b/src/arch/amdgpu/vega/insts/smem.cc @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SMEM__S_LOAD_DWORD class methods --- + + Inst_SMEM__S_LOAD_DWORD::Inst_SMEM__S_LOAD_DWORD(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dword") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORD + + Inst_SMEM__S_LOAD_DWORD::~Inst_SMEM__S_LOAD_DWORD() + { + } // ~Inst_SMEM__S_LOAD_DWORD + + /** + * Read 1 dword from scalar data cache. If the offset is specified as an + * sgpr, the sgpr contains an unsigned byte offset (the 2 LSBs are + * ignored). If the offset is specified as an immediate 20-bit constant, + * the constant is an unsigned byte offset. 
+ */ + void + Inst_SMEM__S_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + + addr.read(); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe + .issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<1>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst) + { + ScalarOperandU32 sdst(gpuDynInst, instData.SDATA); + sdst.write(); + } // completeAcc + // --- Inst_SMEM__S_LOAD_DWORDX2 class methods --- + + Inst_SMEM__S_LOAD_DWORDX2::Inst_SMEM__S_LOAD_DWORDX2(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_load_dwordx2") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_SMEM__S_LOAD_DWORDX2 + + Inst_SMEM__S_LOAD_DWORDX2::~Inst_SMEM__S_LOAD_DWORDX2() + { + } // ~Inst_SMEM__S_LOAD_DWORDX2 + + /** + * Read 2 dwords from scalar data cache. See s_load_dword for details on + * the offset input. 
     */
    void
    Inst_SMEM__S_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        // Same addressing scheme as s_load_dword; only the access width
        // (2 dwords) differs.
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);

        addr.read();

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue a 2-dword scalar memory read.
        initMemRead<2>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 2 dwords commit through a 64-bit destination operand.
        ScalarOperandU64 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_LOAD_DWORDX4 class methods ---

    Inst_SMEM__S_LOAD_DWORDX4::Inst_SMEM__S_LOAD_DWORDX4(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX4

    Inst_SMEM__S_LOAD_DWORDX4::~Inst_SMEM__S_LOAD_DWORDX4()
    {
    } // ~Inst_SMEM__S_LOAD_DWORDX4

    // --- description from .arch file ---
    // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        // Same addressing scheme as s_load_dword; access width is 4 dwords.
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);

        addr.read();

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue a 4-dword scalar memory read.
        initMemRead<4>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 4 dwords commit through a 128-bit destination operand.
        ScalarOperandU128 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_LOAD_DWORDX8 class methods ---

    Inst_SMEM__S_LOAD_DWORDX8::Inst_SMEM__S_LOAD_DWORDX8(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx8")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX8

    Inst_SMEM__S_LOAD_DWORDX8::~Inst_SMEM__S_LOAD_DWORDX8()
    {
    } // ~Inst_SMEM__S_LOAD_DWORDX8

    // --- description from .arch file ---
    // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst)
    {
        // Same addressing scheme as s_load_dword; access width is 8 dwords.
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);

        addr.read();

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue an 8-dword scalar memory read.
        initMemRead<8>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 8 dwords commit through a 256-bit destination operand.
        ScalarOperandU256 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_LOAD_DWORDX16 class methods ---

    Inst_SMEM__S_LOAD_DWORDX16::Inst_SMEM__S_LOAD_DWORDX16(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_load_dwordx16")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_LOAD_DWORDX16

    Inst_SMEM__S_LOAD_DWORDX16::~Inst_SMEM__S_LOAD_DWORDX16()
    {
    } // ~Inst_SMEM__S_LOAD_DWORDX16

    // --- description from .arch file ---
    // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Same addressing scheme as s_load_dword; access width is 16 dwords.
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);

        addr.read();

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue a 16-dword scalar memory read.
        initMemRead<16>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 16 dwords commit through a 512-bit destination operand.
        ScalarOperandU512 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORD class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORD::Inst_SMEM__S_BUFFER_LOAD_DWORD(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dword")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORD

    Inst_SMEM__S_BUFFER_LOAD_DWORD::~Inst_SMEM__S_BUFFER_LOAD_DWORD()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORD

    // --- description from .arch file ---
    // Read 1 dword from scalar data cache. See S_LOAD_DWORD for details on the
    // --- offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        // Buffer variant: SBASE names a 128-bit operand (presumably a buffer
        // resource descriptor) and, unlike the flat scalar loads, is NOT
        // shifted left by one here.
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        // Byte offset: 20-bit immediate (IMM set) or the SGPR named by OFFSET.
        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue a 1-dword scalar memory read.
        initMemRead<1>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 1 request, size 32
        ScalarOperandU32 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX2 class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::Inst_SMEM__S_BUFFER_LOAD_DWORDX2(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX2

    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::~Inst_SMEM__S_BUFFER_LOAD_DWORDX2()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX2

    // --- description from .arch file ---
    // Read 2 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        // Same addressing scheme as s_buffer_load_dword; width is 2 dwords.
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue a 2-dword scalar memory read.
        initMemRead<2>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // use U64 because 2 requests, each size 32
        ScalarOperandU64 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX4 class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::Inst_SMEM__S_BUFFER_LOAD_DWORDX4(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX4

    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::~Inst_SMEM__S_BUFFER_LOAD_DWORDX4()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX4

    // --- description from .arch file ---
    // Read 4 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        // Same addressing scheme as s_buffer_load_dword; width is 4 dwords.
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue a 4-dword scalar memory read.
        initMemRead<4>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 4 requests, each size 32
        ScalarOperandU128 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX8 class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::Inst_SMEM__S_BUFFER_LOAD_DWORDX8(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx8")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX8

    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::~Inst_SMEM__S_BUFFER_LOAD_DWORDX8()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX8

    // --- description from .arch file ---
    // Read 8 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::execute(GPUDynInstPtr gpuDynInst)
    {
        // Same addressing scheme as s_buffer_load_dword; width is 8 dwords.
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue an 8-dword scalar memory read.
        initMemRead<8>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX8::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 8 requests, each size 32
        ScalarOperandU256 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_LOAD_DWORDX16 class methods ---

    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::Inst_SMEM__S_BUFFER_LOAD_DWORDX16(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_load_dwordx16")
    {
        setFlag(MemoryRef);
        setFlag(Load);
    } // Inst_SMEM__S_BUFFER_LOAD_DWORDX16

    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::~Inst_SMEM__S_BUFFER_LOAD_DWORDX16()
    {
    } // ~Inst_SMEM__S_BUFFER_LOAD_DWORDX16

    // --- description from .arch file ---
    // Read 16 dwords from scalar data cache. See S_LOAD_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::execute(GPUDynInstPtr gpuDynInst)
    {
        // Same addressing scheme as s_buffer_load_dword; width is 16 dwords.
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU128 rsrcDesc(gpuDynInst, instData.SBASE);

        rsrcDesc.read();

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, rsrcDesc, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe
            .issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue a 16-dword scalar memory read.
        initMemRead<16>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_LOAD_DWORDX16::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // 16 requests, each size 32
        ScalarOperandU512 sdst(gpuDynInst, instData.SDATA);
        sdst.write();
    } // completeAcc
    // --- Inst_SMEM__S_STORE_DWORD class methods ---

    Inst_SMEM__S_STORE_DWORD::Inst_SMEM__S_STORE_DWORD(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_store_dword")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_STORE_DWORD

    Inst_SMEM__S_STORE_DWORD::~Inst_SMEM__S_STORE_DWORD()
    {
    } // ~Inst_SMEM__S_STORE_DWORD

    // --- description from .arch file ---
    // Write 1 dword to scalar data cache.
    // If the offset is specified as an SGPR, the SGPR contains an unsigned
    // BYTE offset (the 2 LSBs are ignored).
    // If the offset is specified as an immediate 20-bit constant, the
    // constant is an unsigned BYTE offset.
    void
    Inst_SMEM__S_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
        // SDATA is the source of the store (1 dword -> 32-bit operand).
        ConstScalarOperandU32 sdata(gpuDynInst, instData.SDATA);

        addr.read();
        sdata.read();

        // Stage the store data in the instruction's scalar_data buffer; it
        // is consumed later by initMemWrite in initiateAcc().
        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
                    sizeof(ScalarRegU32));

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue a 1-dword scalar memory write.
        initMemWrite<1>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Stores have no destination register to commit.
    } // completeAcc
    // --- Inst_SMEM__S_STORE_DWORDX2 class methods ---

    Inst_SMEM__S_STORE_DWORDX2::Inst_SMEM__S_STORE_DWORDX2(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_store_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_STORE_DWORDX2

    Inst_SMEM__S_STORE_DWORDX2::~Inst_SMEM__S_STORE_DWORDX2()
    {
    } // ~Inst_SMEM__S_STORE_DWORDX2

    // --- description from .arch file ---
    // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        // Same addressing scheme as s_store_dword; 2 dwords of store data
        // are read through a 64-bit operand.
        Wavefront *wf = gpuDynInst->wavefront();
        gpuDynInst->execUnitId = wf->execUnitId;
        gpuDynInst->latency.init(gpuDynInst->computeUnit());
        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
        ScalarRegU32 offset(0);
        ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1);
        ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA);

        addr.read();
        sdata.read();

        // Stage all 64 bits of store data for initiateAcc().
        std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(),
                    sizeof(ScalarRegU64));

        if (instData.IMM) {
            offset = extData.OFFSET;
        } else {
            ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET);
            off_sgpr.read();
            offset = off_sgpr.rawData();
        }

        calcAddr(gpuDynInst, addr, offset);

        gpuDynInst->computeUnit()->scalarMemoryPipe.
            issueRequest(gpuDynInst);
    } // execute

    void
    Inst_SMEM__S_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
        // Issue a 2-dword scalar memory write.
        initMemWrite<2>(gpuDynInst);
    } // initiateAcc

    void
    Inst_SMEM__S_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
        // Stores have no destination register to commit.
    } // completeAcc
    // --- Inst_SMEM__S_STORE_DWORDX4 class methods ---

    Inst_SMEM__S_STORE_DWORDX4::Inst_SMEM__S_STORE_DWORDX4(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_store_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_STORE_DWORDX4

    Inst_SMEM__S_STORE_DWORDX4::~Inst_SMEM__S_STORE_DWORDX4()
    {
    } // ~Inst_SMEM__S_STORE_DWORDX4

    // --- description from .arch file ---
    // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
+ void + Inst_SMEM__S_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + ScalarRegU32 offset(0); + ConstScalarOperandU64 addr(gpuDynInst, instData.SBASE << 1); + ConstScalarOperandU64 sdata(gpuDynInst, instData.SDATA); + + addr.read(); + sdata.read(); + + std::memcpy((void*)gpuDynInst->scalar_data, sdata.rawDataPtr(), + sizeof(gpuDynInst->scalar_data)); + + if (instData.IMM) { + offset = extData.OFFSET; + } else { + ConstScalarOperandU32 off_sgpr(gpuDynInst, extData.OFFSET); + off_sgpr.read(); + offset = off_sgpr.rawData(); + } + + calcAddr(gpuDynInst, addr, offset); + + gpuDynInst->computeUnit()->scalarMemoryPipe. + issueRequest(gpuDynInst); + } // execute + + void + Inst_SMEM__S_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemWrite<4>(gpuDynInst); + } // initiateAcc + + void + Inst_SMEM__S_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + // --- Inst_SMEM__S_BUFFER_STORE_DWORD class methods --- + + Inst_SMEM__S_BUFFER_STORE_DWORD::Inst_SMEM__S_BUFFER_STORE_DWORD( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_buffer_store_dword") + { + setFlag(MemoryRef); + setFlag(Store); + } // Inst_SMEM__S_BUFFER_STORE_DWORD + + Inst_SMEM__S_BUFFER_STORE_DWORD::~Inst_SMEM__S_BUFFER_STORE_DWORD() + { + } // ~Inst_SMEM__S_BUFFER_STORE_DWORD + + // --- description from .arch file --- + // Write 1 dword to scalar data cache. See S_STORE_DWORD for details on the + // --- offset input. 
    void
    Inst_SMEM__S_BUFFER_STORE_DWORD::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented in the timing model; executing this opcode is a
        // fatal error.
        panicUnimplemented();
    } // execute

    void
    Inst_SMEM__S_BUFFER_STORE_DWORD::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_STORE_DWORD::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_STORE_DWORDX2 class methods ---

    Inst_SMEM__S_BUFFER_STORE_DWORDX2::Inst_SMEM__S_BUFFER_STORE_DWORDX2(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_store_dwordx2")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_BUFFER_STORE_DWORDX2

    Inst_SMEM__S_BUFFER_STORE_DWORDX2::~Inst_SMEM__S_BUFFER_STORE_DWORDX2()
    {
    } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX2

    // --- description from .arch file ---
    // Write 2 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented in the timing model.
        panicUnimplemented();
    } // execute

    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_SMEM__S_BUFFER_STORE_DWORDX4 class methods ---

    Inst_SMEM__S_BUFFER_STORE_DWORDX4::Inst_SMEM__S_BUFFER_STORE_DWORDX4(
          InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_buffer_store_dwordx4")
    {
        setFlag(MemoryRef);
        setFlag(Store);
    } // Inst_SMEM__S_BUFFER_STORE_DWORDX4

    Inst_SMEM__S_BUFFER_STORE_DWORDX4::~Inst_SMEM__S_BUFFER_STORE_DWORDX4()
    {
    } // ~Inst_SMEM__S_BUFFER_STORE_DWORDX4

    // --- description from .arch file ---
    // Write 4 dwords to scalar data cache. See S_STORE_DWORD for details on
    // the offset input.
    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented in the timing model.
        panicUnimplemented();
    } // execute

    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
    {
    } // initiateAcc

    void
    Inst_SMEM__S_BUFFER_STORE_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
    {
    } // completeAcc
    // --- Inst_SMEM__S_DCACHE_INV class methods ---

    Inst_SMEM__S_DCACHE_INV::Inst_SMEM__S_DCACHE_INV(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_inv")
    {
    } // Inst_SMEM__S_DCACHE_INV

    Inst_SMEM__S_DCACHE_INV::~Inst_SMEM__S_DCACHE_INV()
    {
    } // ~Inst_SMEM__S_DCACHE_INV

    // --- description from .arch file ---
    // Invalidate the scalar data cache.
    void
    Inst_SMEM__S_DCACHE_INV::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented in the timing model.
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_DCACHE_WB class methods ---

    Inst_SMEM__S_DCACHE_WB::Inst_SMEM__S_DCACHE_WB(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_wb")
    {
    } // Inst_SMEM__S_DCACHE_WB

    Inst_SMEM__S_DCACHE_WB::~Inst_SMEM__S_DCACHE_WB()
    {
    } // ~Inst_SMEM__S_DCACHE_WB

    // --- description from .arch file ---
    // Write back dirty data in the scalar data cache.
    void
    Inst_SMEM__S_DCACHE_WB::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented in the timing model.
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_DCACHE_INV_VOL class methods ---

    Inst_SMEM__S_DCACHE_INV_VOL::Inst_SMEM__S_DCACHE_INV_VOL(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_inv_vol")
    {
    } // Inst_SMEM__S_DCACHE_INV_VOL

    Inst_SMEM__S_DCACHE_INV_VOL::~Inst_SMEM__S_DCACHE_INV_VOL()
    {
    } // ~Inst_SMEM__S_DCACHE_INV_VOL

    // --- description from .arch file ---
    // Invalidate the scalar data cache volatile lines.
    void
    Inst_SMEM__S_DCACHE_INV_VOL::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented in the timing model.
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_DCACHE_WB_VOL class methods ---

    Inst_SMEM__S_DCACHE_WB_VOL::Inst_SMEM__S_DCACHE_WB_VOL(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_dcache_wb_vol")
    {
    } // Inst_SMEM__S_DCACHE_WB_VOL

    Inst_SMEM__S_DCACHE_WB_VOL::~Inst_SMEM__S_DCACHE_WB_VOL()
    {
    } // ~Inst_SMEM__S_DCACHE_WB_VOL

    // --- description from .arch file ---
    // Write back dirty data in the scalar data cache volatile lines.
    void
    Inst_SMEM__S_DCACHE_WB_VOL::execute(GPUDynInstPtr gpuDynInst)
    {
        // Not implemented in the timing model.
        panicUnimplemented();
    } // execute
    // --- Inst_SMEM__S_MEMTIME class methods ---

    Inst_SMEM__S_MEMTIME::Inst_SMEM__S_MEMTIME(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_memtime")
    {
        // s_memtime does not issue a memory request
        setFlag(ALU);
    } // Inst_SMEM__S_MEMTIME

    Inst_SMEM__S_MEMTIME::~Inst_SMEM__S_MEMTIME()
    {
    } // ~Inst_SMEM__S_MEMTIME

    // --- description from .arch file ---
    // Return current 64-bit timestamp.
    void
    Inst_SMEM__S_MEMTIME::execute(GPUDynInstPtr gpuDynInst)
    {
        // The "timestamp" is modeled as the compute unit's current cycle
        // count, written to the 64-bit destination SGPR pair.
        ScalarOperandU64 sdst(gpuDynInst, instData.SDATA);
        sdst = (ScalarRegU64)gpuDynInst->computeUnit()->curCycle();
        sdst.write();
    } // execute
    // --- Inst_SMEM__S_MEMREALTIME class methods ---

    Inst_SMEM__S_MEMREALTIME::Inst_SMEM__S_MEMREALTIME(InFmt_SMEM *iFmt)
        : Inst_SMEM(iFmt, "s_memrealtime")
    {
    } // Inst_SMEM__S_MEMREALTIME

    Inst_SMEM__S_MEMREALTIME::~Inst_SMEM__S_MEMREALTIME()
    {
    } // ~Inst_SMEM__S_MEMREALTIME

    // --- description from .arch file ---
    // Return current 64-bit RTC.
+ void + Inst_SMEM__S_MEMREALTIME::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_ATC_PROBE class methods --- + + Inst_SMEM__S_ATC_PROBE::Inst_SMEM__S_ATC_PROBE(InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_atc_probe") + { + } // Inst_SMEM__S_ATC_PROBE + + Inst_SMEM__S_ATC_PROBE::~Inst_SMEM__S_ATC_PROBE() + { + } // ~Inst_SMEM__S_ATC_PROBE + + // --- description from .arch file --- + // Probe or prefetch an address into the SQC data cache. + void + Inst_SMEM__S_ATC_PROBE::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SMEM__S_ATC_PROBE_BUFFER class methods --- + + Inst_SMEM__S_ATC_PROBE_BUFFER::Inst_SMEM__S_ATC_PROBE_BUFFER( + InFmt_SMEM *iFmt) + : Inst_SMEM(iFmt, "s_atc_probe_buffer") + { + } // Inst_SMEM__S_ATC_PROBE_BUFFER + + Inst_SMEM__S_ATC_PROBE_BUFFER::~Inst_SMEM__S_ATC_PROBE_BUFFER() + { + } // ~Inst_SMEM__S_ATC_PROBE_BUFFER + + // --- description from .arch file --- + // Probe or prefetch an address into the SQC data cache. + void + Inst_SMEM__S_ATC_PROBE_BUFFER::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sop1.cc b/src/arch/amdgpu/vega/insts/sop1.cc new file mode 100644 index 0000000000..fa9a103e39 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sop1.cc @@ -0,0 +1,1504 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOP1__S_MOV_B32 class methods --- + + Inst_SOP1__S_MOV_B32::Inst_SOP1__S_MOV_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_mov_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOV_B32 + + Inst_SOP1__S_MOV_B32::~Inst_SOP1__S_MOV_B32() + { + } // ~Inst_SOP1__S_MOV_B32 + + // --- description from .arch file --- + // D.u = S0.u. 
    void
    Inst_SOP1__S_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // D.u = S0.u — plain 32-bit scalar register move; SCC unaffected.
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);

        src.read();

        sdst = src.rawData();

        sdst.write();
    } // execute
    // --- Inst_SOP1__S_MOV_B64 class methods ---

    Inst_SOP1__S_MOV_B64::Inst_SOP1__S_MOV_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_mov_b64")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_MOV_B64

    Inst_SOP1__S_MOV_B64::~Inst_SOP1__S_MOV_B64()
    {
    } // ~Inst_SOP1__S_MOV_B64

    // --- description from .arch file ---
    // D.u64 = S0.u64.
    void
    Inst_SOP1__S_MOV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // 64-bit scalar register-pair move; SCC unaffected.
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

        src.read();

        sdst = src.rawData();

        sdst.write();
    } // execute
    // --- Inst_SOP1__S_CMOV_B32 class methods ---

    Inst_SOP1__S_CMOV_B32::Inst_SOP1__S_CMOV_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_cmov_b32")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_CMOV_B32

    Inst_SOP1__S_CMOV_B32::~Inst_SOP1__S_CMOV_B32()
    {
    } // ~Inst_SOP1__S_CMOV_B32

    // --- description from .arch file ---
    // (SCC) then D.u = S0.u;
    // else NOP.
    // Conditional move.
    void
    Inst_SOP1__S_CMOV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();
        scc.read();

        // Move only when SCC is set; otherwise the destination (and SCC)
        // are left untouched.
        if (scc.rawData()) {
            sdst = src.rawData();
            sdst.write();
        }
    } // execute
    // --- Inst_SOP1__S_CMOV_B64 class methods ---

    Inst_SOP1__S_CMOV_B64::Inst_SOP1__S_CMOV_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_cmov_b64")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_CMOV_B64

    Inst_SOP1__S_CMOV_B64::~Inst_SOP1__S_CMOV_B64()
    {
    } // ~Inst_SOP1__S_CMOV_B64

    // --- description from .arch file ---
    // if (SCC) then D.u64 = S0.u64;
    // else NOP.
    // Conditional move.
    void
    Inst_SOP1__S_CMOV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();
        scc.read();

        // Move only when SCC is set; otherwise destination is untouched.
        if (scc.rawData()) {
            sdst = src.rawData();
            sdst.write();
        }
    } // execute
    // --- Inst_SOP1__S_NOT_B32 class methods ---

    Inst_SOP1__S_NOT_B32::Inst_SOP1__S_NOT_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_not_b32")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_NOT_B32

    Inst_SOP1__S_NOT_B32::~Inst_SOP1__S_NOT_B32()
    {
    } // ~Inst_SOP1__S_NOT_B32

    // --- description from .arch file ---
    // D.u = ~S0.u;
    // SCC = 1 if result is non-zero.
    // Bitwise negation.
    void
    Inst_SOP1__S_NOT_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = ~src.rawData();

        // SCC reflects whether the result is non-zero.
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_NOT_B64 class methods ---

    Inst_SOP1__S_NOT_B64::Inst_SOP1__S_NOT_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_not_b64")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_NOT_B64

    Inst_SOP1__S_NOT_B64::~Inst_SOP1__S_NOT_B64()
    {
    } // ~Inst_SOP1__S_NOT_B64

    // --- description from .arch file ---
    // D.u64 = ~S0.u64;
    // SCC = 1 if result is non-zero.
    // Bitwise negation.
    void
    Inst_SOP1__S_NOT_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = ~src.rawData();
        // SCC reflects whether the result is non-zero.
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_WQM_B32 class methods ---

    Inst_SOP1__S_WQM_B32::Inst_SOP1__S_WQM_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_wqm_b32")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_WQM_B32

    Inst_SOP1__S_WQM_B32::~Inst_SOP1__S_WQM_B32()
    {
    } // ~Inst_SOP1__S_WQM_B32

    // --- description from .arch file ---
    // D[i] = (S0[(i & ~3):(i | 3)] != 0);
    // Computes whole quad mode for an active/valid mask.
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_WQM_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        // wholeQuadMode() is a shared helper (inst_util.hh).
        sdst = wholeQuadMode(src.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_WQM_B64 class methods ---

    Inst_SOP1__S_WQM_B64::Inst_SOP1__S_WQM_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_wqm_b64")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_WQM_B64

    Inst_SOP1__S_WQM_B64::~Inst_SOP1__S_WQM_B64()
    {
    } // ~Inst_SOP1__S_WQM_B64

    // --- description from .arch file ---
    // D[i] = (S0[(i & ~3):(i | 3)] != 0);
    // Computes whole quad mode for an active/valid mask.
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_WQM_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = wholeQuadMode(src.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_BREV_B32 class methods ---

    Inst_SOP1__S_BREV_B32::Inst_SOP1__S_BREV_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_brev_b32")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_BREV_B32

    Inst_SOP1__S_BREV_B32::~Inst_SOP1__S_BREV_B32()
    {
    } // ~Inst_SOP1__S_BREV_B32

    // --- description from .arch file ---
    // D.u[31:0] = S0.u[0:31] (reverse bits).
    void
    Inst_SOP1__S_BREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Bit reversal; note SCC is NOT modified by this instruction.
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);

        src.read();

        sdst = reverseBits(src.rawData());

        sdst.write();
    } // execute
    // --- Inst_SOP1__S_BREV_B64 class methods ---

    Inst_SOP1__S_BREV_B64::Inst_SOP1__S_BREV_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_brev_b64")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_BREV_B64

    Inst_SOP1__S_BREV_B64::~Inst_SOP1__S_BREV_B64()
    {
    } // ~Inst_SOP1__S_BREV_B64

    // --- description from .arch file ---
    // D.u64[63:0] = S0.u64[0:63] (reverse bits).
    void
    Inst_SOP1__S_BREV_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // 64-bit bit reversal; SCC is NOT modified.
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

        src.read();

        sdst = reverseBits(src.rawData());

        sdst.write();
    } // execute
    // --- Inst_SOP1__S_BCNT0_I32_B32 class methods ---

    Inst_SOP1__S_BCNT0_I32_B32::Inst_SOP1__S_BCNT0_I32_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bcnt0_i32_b32")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_BCNT0_I32_B32

    Inst_SOP1__S_BCNT0_I32_B32::~Inst_SOP1__S_BCNT0_I32_B32()
    {
    } // ~Inst_SOP1__S_BCNT0_I32_B32

    // --- description from .arch file ---
    // D.i = CountZeroBits(S0.u);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_BCNT0_I32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        // countZeroBits() is an ISA utility helper; result is staged in the
        // destination operand so SCC can be derived from it before write-back.
        sdst = countZeroBits(src.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_BCNT0_I32_B64 class methods ---

    Inst_SOP1__S_BCNT0_I32_B64::Inst_SOP1__S_BCNT0_I32_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bcnt0_i32_b64")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_BCNT0_I32_B64

    Inst_SOP1__S_BCNT0_I32_B64::~Inst_SOP1__S_BCNT0_I32_B64()
    {
    } // ~Inst_SOP1__S_BCNT0_I32_B64

    // --- description from .arch file ---
    // D.i = CountZeroBits(S0.u64);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_BCNT0_I32_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = countZeroBits(src.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_BCNT1_I32_B32 class methods ---

    Inst_SOP1__S_BCNT1_I32_B32::Inst_SOP1__S_BCNT1_I32_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bcnt1_i32_b32")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_BCNT1_I32_B32

    Inst_SOP1__S_BCNT1_I32_B32::~Inst_SOP1__S_BCNT1_I32_B32()
    {
    } // ~Inst_SOP1__S_BCNT1_I32_B32

    // --- description from .arch file ---
    // D.i = CountOneBits(S0.u);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_BCNT1_I32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        // popCount() counts the set bits (the CountOneBits of the pseudo-code).
        sdst = popCount(src.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_BCNT1_I32_B64 class methods ---

    Inst_SOP1__S_BCNT1_I32_B64::Inst_SOP1__S_BCNT1_I32_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_bcnt1_i32_b64")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_BCNT1_I32_B64

    Inst_SOP1__S_BCNT1_I32_B64::~Inst_SOP1__S_BCNT1_I32_B64()
    {
    } // ~Inst_SOP1__S_BCNT1_I32_B64

    // --- description from .arch file ---
    // D.i = CountOneBits(S0.u64);
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_BCNT1_I32_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = popCount(src.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_FF0_I32_B32 class methods ---

    Inst_SOP1__S_FF0_I32_B32::Inst_SOP1__S_FF0_I32_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_ff0_i32_b32")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_FF0_I32_B32

    Inst_SOP1__S_FF0_I32_B32::~Inst_SOP1__S_FF0_I32_B32()
    {
    } // ~Inst_SOP1__S_FF0_I32_B32

    // --- description from .arch file ---
    // D.i = FindFirstZero(S0.u);
    // If no zeros are found, return -1.
    // Returns the bit position of the first zero from the LSB.
+ void + Inst_SOP1__S_FF0_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstZero(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FF0_I32_B64 class methods --- + + Inst_SOP1__S_FF0_I32_B64::Inst_SOP1__S_FF0_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff0_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_FF0_I32_B64 + + Inst_SOP1__S_FF0_I32_B64::~Inst_SOP1__S_FF0_I32_B64() + { + } // ~Inst_SOP1__S_FF0_I32_B64 + + // --- description from .arch file --- + // D.i = FindFirstZero(S0.u64); + // If no zeros are found, return -1. + // Returns the bit position of the first zero from the LSB. + void + Inst_SOP1__S_FF0_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstZero(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FF1_I32_B32 class methods --- + + Inst_SOP1__S_FF1_I32_B32::Inst_SOP1__S_FF1_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff1_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_FF1_I32_B32 + + Inst_SOP1__S_FF1_I32_B32::~Inst_SOP1__S_FF1_I32_B32() + { + } // ~Inst_SOP1__S_FF1_I32_B32 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u); + // If no ones are found, return -1. + // Returns the bit position of the first one from the LSB. 
+ void + Inst_SOP1__S_FF1_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstOne(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FF1_I32_B64 class methods --- + + Inst_SOP1__S_FF1_I32_B64::Inst_SOP1__S_FF1_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_ff1_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_FF1_I32_B64 + + Inst_SOP1__S_FF1_I32_B64::~Inst_SOP1__S_FF1_I32_B64() + { + } // ~Inst_SOP1__S_FF1_I32_B64 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u64); + // If no ones are found, return -1. + // Returns the bit position of the first one from the LSB. + void + Inst_SOP1__S_FF1_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = findFirstOne(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32_B32 class methods --- + + Inst_SOP1__S_FLBIT_I32_B32::Inst_SOP1__S_FLBIT_I32_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32_B32 + + Inst_SOP1__S_FLBIT_I32_B32::~Inst_SOP1__S_FLBIT_I32_B32() + { + } // ~Inst_SOP1__S_FLBIT_I32_B32 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u); + // If no ones are found, return -1. + // Counts how many zeros before the first one starting from the MSB. 
+ void + Inst_SOP1__S_FLBIT_I32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = countZeroBitsMsb(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32_B64 class methods --- + + Inst_SOP1__S_FLBIT_I32_B64::Inst_SOP1__S_FLBIT_I32_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32_B64 + + Inst_SOP1__S_FLBIT_I32_B64::~Inst_SOP1__S_FLBIT_I32_B64() + { + } // ~Inst_SOP1__S_FLBIT_I32_B64 + + // --- description from .arch file --- + // D.i = FindFirstOne(S0.u64); + // If no ones are found, return -1. + // Counts how many zeros before the first one starting from the MSB. + void + Inst_SOP1__S_FLBIT_I32_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = countZeroBitsMsb(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32 class methods --- + + Inst_SOP1__S_FLBIT_I32::Inst_SOP1__S_FLBIT_I32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32 + + Inst_SOP1__S_FLBIT_I32::~Inst_SOP1__S_FLBIT_I32() + { + } // ~Inst_SOP1__S_FLBIT_I32 + + // --- description from .arch file --- + // D.i = FirstOppositeSignBit(S0.i); + // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. + // Counts how many bits in a row (from MSB to LSB) are the same as the + // sign bit. 
+ void + Inst_SOP1__S_FLBIT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = firstOppositeSignBit(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_FLBIT_I32_I64 class methods --- + + Inst_SOP1__S_FLBIT_I32_I64::Inst_SOP1__S_FLBIT_I32_I64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_flbit_i32_i64") + { + setFlag(ALU); + } // Inst_SOP1__S_FLBIT_I32_I64 + + Inst_SOP1__S_FLBIT_I32_I64::~Inst_SOP1__S_FLBIT_I32_I64() + { + } // ~Inst_SOP1__S_FLBIT_I32_I64 + + // --- description from .arch file --- + // D.i = FirstOppositeSignBit(S0.i64); + // If S0.i == 0 or S0.i == -1 (all bits are the same), return -1. + // Counts how many bits in a row (from MSB to LSB) are the same as the + // sign bit. + void + Inst_SOP1__S_FLBIT_I32_I64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = firstOppositeSignBit(src.rawData()); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_SEXT_I32_I8 class methods --- + + Inst_SOP1__S_SEXT_I32_I8::Inst_SOP1__S_SEXT_I32_I8(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_sext_i32_i8") + { + setFlag(ALU); + } // Inst_SOP1__S_SEXT_I32_I8 + + Inst_SOP1__S_SEXT_I32_I8::~Inst_SOP1__S_SEXT_I32_I8() + { + } // ~Inst_SOP1__S_SEXT_I32_I8 + + // --- description from .arch file --- + // D.i = signext(S0.i[7:0]) (sign extension). 
+ void + Inst_SOP1__S_SEXT_I32_I8::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = sext::digits>( + bits(src.rawData(), 7, 0)); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_SEXT_I32_I16 class methods --- + + Inst_SOP1__S_SEXT_I32_I16::Inst_SOP1__S_SEXT_I32_I16(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_sext_i32_i16") + { + setFlag(ALU); + } // Inst_SOP1__S_SEXT_I32_I16 + + Inst_SOP1__S_SEXT_I32_I16::~Inst_SOP1__S_SEXT_I32_I16() + { + } // ~Inst_SOP1__S_SEXT_I32_I16 + + // --- description from .arch file --- + // D.i = signext(S0.i[15:0]) (sign extension). + void + Inst_SOP1__S_SEXT_I32_I16::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = sext::digits>( + bits(src.rawData(), 15, 0)); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BITSET0_B32 class methods --- + + Inst_SOP1__S_BITSET0_B32::Inst_SOP1__S_BITSET0_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bitset0_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BITSET0_B32 + + Inst_SOP1__S_BITSET0_B32::~Inst_SOP1__S_BITSET0_B32() + { + } // ~Inst_SOP1__S_BITSET0_B32 + + // --- description from .arch file --- + // D.u[S0.u[4:0]] = 0. 
+ void + Inst_SOP1__S_BITSET0_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst.setBit(bits(src.rawData(), 4, 0), 0); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BITSET0_B64 class methods --- + + Inst_SOP1__S_BITSET0_B64::Inst_SOP1__S_BITSET0_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bitset0_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BITSET0_B64 + + Inst_SOP1__S_BITSET0_B64::~Inst_SOP1__S_BITSET0_B64() + { + } // ~Inst_SOP1__S_BITSET0_B64 + + // --- description from .arch file --- + // D.u64[S0.u[5:0]] = 0. + void + Inst_SOP1__S_BITSET0_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst.setBit(bits(src.rawData(), 5, 0), 0); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BITSET1_B32 class methods --- + + Inst_SOP1__S_BITSET1_B32::Inst_SOP1__S_BITSET1_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bitset1_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_BITSET1_B32 + + Inst_SOP1__S_BITSET1_B32::~Inst_SOP1__S_BITSET1_B32() + { + } // ~Inst_SOP1__S_BITSET1_B32 + + // --- description from .arch file --- + // D.u[S0.u[4:0]] = 1. + void + Inst_SOP1__S_BITSET1_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst.setBit(bits(src.rawData(), 4, 0), 1); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_BITSET1_B64 class methods --- + + Inst_SOP1__S_BITSET1_B64::Inst_SOP1__S_BITSET1_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_bitset1_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_BITSET1_B64 + + Inst_SOP1__S_BITSET1_B64::~Inst_SOP1__S_BITSET1_B64() + { + } // ~Inst_SOP1__S_BITSET1_B64 + + // --- description from .arch file --- + // D.u64[S0.u[5:0]] = 1. 
+ void + Inst_SOP1__S_BITSET1_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst.setBit(bits(src.rawData(), 5, 0), 1); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_GETPC_B64 class methods --- + + Inst_SOP1__S_GETPC_B64::Inst_SOP1__S_GETPC_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_getpc_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_GETPC_B64 + + Inst_SOP1__S_GETPC_B64::~Inst_SOP1__S_GETPC_B64() + { + } // ~Inst_SOP1__S_GETPC_B64 + + // --- description from .arch file --- + // D.u64 = PC + 4. + // Destination receives the byte address of the next instruction. + void + Inst_SOP1__S_GETPC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Addr pc = gpuDynInst->pc(); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + sdst = pc + 4; + + sdst.write(); + } // execute + // --- Inst_SOP1__S_SETPC_B64 class methods --- + + Inst_SOP1__S_SETPC_B64::Inst_SOP1__S_SETPC_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_setpc_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_SETPC_B64 + + Inst_SOP1__S_SETPC_B64::~Inst_SOP1__S_SETPC_B64() + { + } // ~Inst_SOP1__S_SETPC_B64 + + // --- description from .arch file --- + // PC = S0.u64. + // S0.u64 is a byte address of the instruction to jump to. + void + Inst_SOP1__S_SETPC_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + + src.read(); + + wf->pc(src.rawData()); + } // execute + // --- Inst_SOP1__S_SWAPPC_B64 class methods --- + + Inst_SOP1__S_SWAPPC_B64::Inst_SOP1__S_SWAPPC_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_swappc_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_SWAPPC_B64 + + Inst_SOP1__S_SWAPPC_B64::~Inst_SOP1__S_SWAPPC_B64() + { + } // ~Inst_SOP1__S_SWAPPC_B64 + + // --- description from .arch file --- + // D.u64 = PC + 4; PC = S0.u64. + // S0.u64 is a byte address of the instruction to jump to. 
    void
    Inst_SOP1__S_SWAPPC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        Addr pc = gpuDynInst->pc();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);

        src.read();

        // Return address: byte address of the next instruction.
        sdst = pc + 4;

        // SSRC0 was read before SDST is written, so the swap works even
        // when SDST and SSRC0 name the same register pair.
        wf->pc(src.rawData());
        sdst.write();
    } // execute
    // --- Inst_SOP1__S_RFE_B64 class methods ---

    Inst_SOP1__S_RFE_B64::Inst_SOP1__S_RFE_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_rfe_b64")
    {
    } // Inst_SOP1__S_RFE_B64

    Inst_SOP1__S_RFE_B64::~Inst_SOP1__S_RFE_B64()
    {
    } // ~Inst_SOP1__S_RFE_B64

    // --- description from .arch file ---
    // PRIV = 0;
    // PC = S0.u64.
    // Return from exception handler and continue.
    // This instruction may only be used within a trap handler.
    void
    Inst_SOP1__S_RFE_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        // Trap handlers are not modeled; executing this is a fatal error.
        panicUnimplemented();
    } // execute
    // --- Inst_SOP1__S_AND_SAVEEXEC_B64 class methods ---

    Inst_SOP1__S_AND_SAVEEXEC_B64::Inst_SOP1__S_AND_SAVEEXEC_B64(
        InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_and_saveexec_b64")
    {
        setFlag(ALU);
        setFlag(ReadsEXEC);
        setFlag(WritesEXEC);
    } // Inst_SOP1__S_AND_SAVEEXEC_B64

    Inst_SOP1__S_AND_SAVEEXEC_B64::~Inst_SOP1__S_AND_SAVEEXEC_B64()
    {
    } // ~Inst_SOP1__S_AND_SAVEEXEC_B64

    // --- description from .arch file ---
    // D.u64 = EXEC;
    // EXEC = S0.u64 & EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_AND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        // Order matters: the old EXEC is captured into SDST before the
        // mask is updated; SCC then reflects the NEW mask.
        sdst = wf->execMask().to_ullong();
        wf->execMask() = src.rawData() & wf->execMask().to_ullong();
        scc = wf->execMask().any() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_OR_SAVEEXEC_B64 class methods ---

    Inst_SOP1__S_OR_SAVEEXEC_B64::Inst_SOP1__S_OR_SAVEEXEC_B64(
        InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_or_saveexec_b64")
    {
        setFlag(ALU);
        setFlag(ReadsEXEC);
        setFlag(WritesEXEC);
    } // Inst_SOP1__S_OR_SAVEEXEC_B64

    Inst_SOP1__S_OR_SAVEEXEC_B64::~Inst_SOP1__S_OR_SAVEEXEC_B64()
    {
    } // ~Inst_SOP1__S_OR_SAVEEXEC_B64

    // --- description from .arch file ---
    // D.u64 = EXEC;
    // EXEC = S0.u64 | EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_OR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = wf->execMask().to_ullong();
        wf->execMask() = src.rawData() | wf->execMask().to_ullong();
        scc = wf->execMask().any() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_XOR_SAVEEXEC_B64 class methods ---

    Inst_SOP1__S_XOR_SAVEEXEC_B64::Inst_SOP1__S_XOR_SAVEEXEC_B64(
        InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_xor_saveexec_b64")
    {
        setFlag(ALU);
        setFlag(ReadsEXEC);
        setFlag(WritesEXEC);
    } // Inst_SOP1__S_XOR_SAVEEXEC_B64

    Inst_SOP1__S_XOR_SAVEEXEC_B64::~Inst_SOP1__S_XOR_SAVEEXEC_B64()
    {
    } // ~Inst_SOP1__S_XOR_SAVEEXEC_B64

    // --- description from .arch file ---
    // D.u64 = EXEC;
    // EXEC = S0.u64 ^ EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_XOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        // Old EXEC is saved into SDST before the mask is updated; SCC is
        // taken from the NEW mask.
        sdst = wf->execMask().to_ullong();
        wf->execMask() = src.rawData() ^ wf->execMask().to_ullong();
        scc = wf->execMask().any() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_ANDN2_SAVEEXEC_B64 class methods ---

    Inst_SOP1__S_ANDN2_SAVEEXEC_B64::Inst_SOP1__S_ANDN2_SAVEEXEC_B64(
        InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_andn2_saveexec_b64")
    {
        setFlag(ALU);
        setFlag(ReadsEXEC);
        setFlag(WritesEXEC);
    } // Inst_SOP1__S_ANDN2_SAVEEXEC_B64

    Inst_SOP1__S_ANDN2_SAVEEXEC_B64::~Inst_SOP1__S_ANDN2_SAVEEXEC_B64()
    {
    } // ~Inst_SOP1__S_ANDN2_SAVEEXEC_B64

    // --- description from .arch file ---
    // D.u64 = EXEC;
    // EXEC = S0.u64 & ~EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_ANDN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = wf->execMask().to_ullong();
        // "&~" is binary-AND with a complemented operand, i.e.
        // src & (~exec), matching the pseudo-code above.
        wf->execMask() = src.rawData() &~ wf->execMask().to_ullong();
        scc = wf->execMask().any() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_ORN2_SAVEEXEC_B64 class methods ---

    Inst_SOP1__S_ORN2_SAVEEXEC_B64::Inst_SOP1__S_ORN2_SAVEEXEC_B64(
        InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_orn2_saveexec_b64")
    {
        setFlag(ALU);
        setFlag(ReadsEXEC);
        setFlag(WritesEXEC);
    } // Inst_SOP1__S_ORN2_SAVEEXEC_B64

    Inst_SOP1__S_ORN2_SAVEEXEC_B64::~Inst_SOP1__S_ORN2_SAVEEXEC_B64()
    {
    } // ~Inst_SOP1__S_ORN2_SAVEEXEC_B64

    // --- description from .arch file ---
    // D.u64 = EXEC;
    // EXEC = S0.u64 | ~EXEC;
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_ORN2_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = wf->execMask().to_ullong();
        // "|~" == src | (~exec), matching the pseudo-code above.
        wf->execMask() = src.rawData() |~ wf->execMask().to_ullong();
        scc = wf->execMask().any() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_NAND_SAVEEXEC_B64 class methods ---

    Inst_SOP1__S_NAND_SAVEEXEC_B64::Inst_SOP1__S_NAND_SAVEEXEC_B64(
        InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_nand_saveexec_b64")
    {
        setFlag(ALU);
        setFlag(ReadsEXEC);
        setFlag(WritesEXEC);
    } // Inst_SOP1__S_NAND_SAVEEXEC_B64

    Inst_SOP1__S_NAND_SAVEEXEC_B64::~Inst_SOP1__S_NAND_SAVEEXEC_B64()
    {
    } // ~Inst_SOP1__S_NAND_SAVEEXEC_B64

    // --- description from .arch file ---
    // D.u64 = EXEC;
    // EXEC = ~(S0.u64 & EXEC);
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_NAND_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        // Old EXEC captured before the update; SCC reflects the NEW mask.
        sdst = wf->execMask().to_ullong();
        wf->execMask() = ~(src.rawData() & wf->execMask().to_ullong());
        scc = wf->execMask().any() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_NOR_SAVEEXEC_B64 class methods ---

    Inst_SOP1__S_NOR_SAVEEXEC_B64::Inst_SOP1__S_NOR_SAVEEXEC_B64(
        InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_nor_saveexec_b64")
    {
        setFlag(ALU);
        setFlag(ReadsEXEC);
        setFlag(WritesEXEC);
    } // Inst_SOP1__S_NOR_SAVEEXEC_B64

    Inst_SOP1__S_NOR_SAVEEXEC_B64::~Inst_SOP1__S_NOR_SAVEEXEC_B64()
    {
    } // ~Inst_SOP1__S_NOR_SAVEEXEC_B64

    // --- description from .arch file ---
    // D.u64 = EXEC;
    // EXEC = ~(S0.u64 | EXEC);
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_NOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = wf->execMask().to_ullong();
        wf->execMask() = ~(src.rawData() | wf->execMask().to_ullong());
        scc = wf->execMask().any() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_XNOR_SAVEEXEC_B64 class methods ---

    Inst_SOP1__S_XNOR_SAVEEXEC_B64::Inst_SOP1__S_XNOR_SAVEEXEC_B64(
        InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_xnor_saveexec_b64")
    {
        setFlag(ALU);
        setFlag(ReadsEXEC);
        setFlag(WritesEXEC);
    } // Inst_SOP1__S_XNOR_SAVEEXEC_B64

    Inst_SOP1__S_XNOR_SAVEEXEC_B64::~Inst_SOP1__S_XNOR_SAVEEXEC_B64()
    {
    } // ~Inst_SOP1__S_XNOR_SAVEEXEC_B64

    // --- description from .arch file ---
    // D.u64 = EXEC;
    // EXEC = ~(S0.u64 ^ EXEC);
    // SCC = 1 if the new value of EXEC is non-zero.
    void
    Inst_SOP1__S_XNOR_SAVEEXEC_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = wf->execMask().to_ullong();
        wf->execMask() = ~(src.rawData() ^ wf->execMask().to_ullong());
        scc = wf->execMask().any() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_QUADMASK_B32 class methods ---

    Inst_SOP1__S_QUADMASK_B32::Inst_SOP1__S_QUADMASK_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_quadmask_b32")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_QUADMASK_B32

    Inst_SOP1__S_QUADMASK_B32::~Inst_SOP1__S_QUADMASK_B32()
    {
    } // ~Inst_SOP1__S_QUADMASK_B32

    // --- description from .arch file ---
    // D.u = QuadMask(S0.u):
    // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[31:8] = 0;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_QUADMASK_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        // quadMask() is an ISA utility helper; per the pseudo-code above it
        // ORs each 4-bit group of the source into one result bit.
        sdst = quadMask(src.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_QUADMASK_B64 class methods ---

    Inst_SOP1__S_QUADMASK_B64::Inst_SOP1__S_QUADMASK_B64(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_quadmask_b64")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_QUADMASK_B64

    Inst_SOP1__S_QUADMASK_B64::~Inst_SOP1__S_QUADMASK_B64()
    {
    } // ~Inst_SOP1__S_QUADMASK_B64

    // --- description from .arch file ---
    // D.u64 = QuadMask(S0.u64):
    // D[0] = OR(S0[3:0]), D[1] = OR(S0[7:4]) ... D[63:16] = 0;
    // SCC = 1 if result is non-zero.
    void
    Inst_SOP1__S_QUADMASK_B64::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = quadMask(src.rawData());
        scc = sdst.rawData() ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOP1__S_MOVRELS_B32 class methods ---

    Inst_SOP1__S_MOVRELS_B32::Inst_SOP1__S_MOVRELS_B32(InFmt_SOP1 *iFmt)
        : Inst_SOP1(iFmt, "s_movrels_b32")
    {
        setFlag(ALU);
    } // Inst_SOP1__S_MOVRELS_B32

    Inst_SOP1__S_MOVRELS_B32::~Inst_SOP1__S_MOVRELS_B32()
    {
    } // ~Inst_SOP1__S_MOVRELS_B32

    // --- description from .arch file ---
    // D.u = SGPR[S0.u + M0.u].u (move from relative source).
+ void + Inst_SOP1__S_MOVRELS_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0 + m0.rawData()); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOVRELS_B64 class methods --- + + Inst_SOP1__S_MOVRELS_B64::Inst_SOP1__S_MOVRELS_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movrels_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELS_B64 + + Inst_SOP1__S_MOVRELS_B64::~Inst_SOP1__S_MOVRELS_B64() + { + } // ~Inst_SOP1__S_MOVRELS_B64 + + // --- description from .arch file --- + // D.u64 = SGPR[S0.u + M0.u].u64 (move from relative source). + // The index in M0.u must be even for this operation. + void + Inst_SOP1__S_MOVRELS_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0 + m0.rawData()); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOVRELD_B32 class methods --- + + Inst_SOP1__S_MOVRELD_B32::Inst_SOP1__S_MOVRELD_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movreld_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELD_B32 + + Inst_SOP1__S_MOVRELD_B32::~Inst_SOP1__S_MOVRELD_B32() + { + } // ~Inst_SOP1__S_MOVRELD_B32 + + // --- description from .arch file --- + // SGPR[D.u + M0.u].u = S0.u (move to relative destination). 
+ void + Inst_SOP1__S_MOVRELD_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU32 src(gpuDynInst, instData.SSRC0); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST + m0.rawData()); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_MOVRELD_B64 class methods --- + + Inst_SOP1__S_MOVRELD_B64::Inst_SOP1__S_MOVRELD_B64(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_movreld_b64") + { + setFlag(ALU); + } // Inst_SOP1__S_MOVRELD_B64 + + Inst_SOP1__S_MOVRELD_B64::~Inst_SOP1__S_MOVRELD_B64() + { + } // ~Inst_SOP1__S_MOVRELD_B64 + + // --- description from .arch file --- + // SGPR[D.u + M0.u].u64 = S0.u64 (move to relative destination). + // The index in M0.u must be even for this operation. + void + Inst_SOP1__S_MOVRELD_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 m0(gpuDynInst, REG_M0); + m0.read(); + ConstScalarOperandU64 src(gpuDynInst, instData.SSRC0); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST + m0.rawData()); + + src.read(); + + sdst = src.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP1__S_CBRANCH_JOIN class methods --- + + Inst_SOP1__S_CBRANCH_JOIN::Inst_SOP1__S_CBRANCH_JOIN(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_cbranch_join") + { + setFlag(Branch); + setFlag(WritesEXEC); + } // Inst_SOP1__S_CBRANCH_JOIN + + Inst_SOP1__S_CBRANCH_JOIN::~Inst_SOP1__S_CBRANCH_JOIN() + { + } // ~Inst_SOP1__S_CBRANCH_JOIN + + // --- description from .arch file --- + // saved_csp = S0.u; + // if (CSP == saved_csp) then + // PC += 4; // Second time to JOIN: continue with program. + // else + // CSP -= 1; // First time to JOIN; jump to other FORK path. + // {PC, EXEC} = SGPR[CSP * 4]; // Read 128 bits from 4 consecutive + // SGPRs. + // end + // Conditional branch join point (end of conditional branch block). S0 is + // saved CSP value. + // See S_CBRANCH_G_FORK and S_CBRANCH_I_FORK for related instructions. 
+ void + Inst_SOP1__S_CBRANCH_JOIN::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP1__S_ABS_I32 class methods --- + + Inst_SOP1__S_ABS_I32::Inst_SOP1__S_ABS_I32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_abs_i32") + { + setFlag(ALU); + } // Inst_SOP1__S_ABS_I32 + + Inst_SOP1__S_ABS_I32::~Inst_SOP1__S_ABS_I32() + { + } // ~Inst_SOP1__S_ABS_I32 + + // --- description from .arch file --- + // if (S.i < 0) then D.i = -S.i; + // else D.i = S.i; + // SCC = 1 if result is non-zero. + // Integer absolute value. + void + Inst_SOP1__S_ABS_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src(gpuDynInst, instData.SSRC0); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + sdst = std::abs(src.rawData()); + + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP1__S_MOV_FED_B32 class methods --- + + Inst_SOP1__S_MOV_FED_B32::Inst_SOP1__S_MOV_FED_B32(InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_mov_fed_b32") + { + setFlag(ALU); + } // Inst_SOP1__S_MOV_FED_B32 + + Inst_SOP1__S_MOV_FED_B32::~Inst_SOP1__S_MOV_FED_B32() + { + } // ~Inst_SOP1__S_MOV_FED_B32 + + // --- description from .arch file --- + // D.u = S0.u. Introduce an EDC double-detect error on write to the + // destination SGPR. + void + Inst_SOP1__S_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP1__S_SET_GPR_IDX_IDX class methods --- + + Inst_SOP1__S_SET_GPR_IDX_IDX::Inst_SOP1__S_SET_GPR_IDX_IDX( + InFmt_SOP1 *iFmt) + : Inst_SOP1(iFmt, "s_set_gpr_idx_idx") + { + } // Inst_SOP1__S_SET_GPR_IDX_IDX + + Inst_SOP1__S_SET_GPR_IDX_IDX::~Inst_SOP1__S_SET_GPR_IDX_IDX() + { + } // ~Inst_SOP1__S_SET_GPR_IDX_IDX + + // --- description from .arch file --- + // M0[7:0] = S0.u[7:0]. + // Modify the index used in vector GPR indexing. 
+ void + Inst_SOP1__S_SET_GPR_IDX_IDX::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sop2.cc b/src/arch/amdgpu/vega/insts/sop2.cc new file mode 100644 index 0000000000..a2965763f7 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sop2.cc @@ -0,0 +1,1556 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOP2__S_ADD_U32 class methods --- + + Inst_SOP2__S_ADD_U32::Inst_SOP2__S_ADD_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_add_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_ADD_U32 + + Inst_SOP2__S_ADD_U32::~Inst_SOP2__S_ADD_U32() + { + } // ~Inst_SOP2__S_ADD_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u; + // SCC = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an unsigned + // --- overflow/carry-out for S_ADDC_U32. + void + Inst_SOP2__S_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() + src1.rawData(); + scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData()) + >= 0x100000000ULL ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_SUB_U32 class methods --- + + Inst_SOP2__S_SUB_U32::Inst_SOP2__S_SUB_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_sub_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_SUB_U32 + + Inst_SOP2__S_SUB_U32::~Inst_SOP2__S_SUB_U32() + { + } // ~Inst_SOP2__S_SUB_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u; + // SCC = (S1.u > S0.u ? 1 : 0) is an unsigned overflow or carry-out for + // --- S_SUBB_U32. + void + Inst_SOP2__S_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() - src1.rawData(); + scc = (src1.rawData() > src0.rawData()) ? 
1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ADD_I32 class methods --- + + Inst_SOP2__S_ADD_I32::Inst_SOP2__S_ADD_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_add_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_ADD_I32 + + Inst_SOP2__S_ADD_I32::~Inst_SOP2__S_ADD_I32() + { + } // ~Inst_SOP2__S_ADD_I32 + + // --- description from .arch file --- + // D.i = S0.i + S1.i; + // SCC = (S0.u[31] == S1.u[31] && S0.u[31] != D.u[31]) is a signed + // overflow. + // This opcode is not suitable for use with S_ADDC_U32 for implementing + // 64-bit operations. + void + Inst_SOP2__S_ADD_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() + src1.rawData(); + scc = (bits(src0.rawData(), 31) == bits(src1.rawData(), 31) + && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) + ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_SUB_I32 class methods --- + + Inst_SOP2__S_SUB_I32::Inst_SOP2__S_SUB_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_sub_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_SUB_I32 + + Inst_SOP2__S_SUB_I32::~Inst_SOP2__S_SUB_I32() + { + } // ~Inst_SOP2__S_SUB_I32 + + // --- description from .arch file --- + // D.i = S0.i - S1.i; + // SCC = (S0.u[31] != S1.u[31] && S0.u[31] != D.u[31]) is a signed + // overflow. + // CAUTION: The condition code behaviour for this opcode is inconsistent + // with V_SUB_I32; see V_SUB_I32 for further details. + // This opcode is not suitable for use with S_SUBB_U32 for implementing + // 64-bit operations. 
+ void + Inst_SOP2__S_SUB_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() - src1.rawData(); + scc = (bits(src0.rawData(), 31) != bits(src1.rawData(), 31) + && bits(src0.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ADDC_U32 class methods --- + + Inst_SOP2__S_ADDC_U32::Inst_SOP2__S_ADDC_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_addc_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_ADDC_U32 + + Inst_SOP2__S_ADDC_U32::~Inst_SOP2__S_ADDC_U32() + { + } // ~Inst_SOP2__S_ADDC_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u + SCC; + // SCC = (S0.u + S1.u + SCC >= 0x800000000ULL ? 1 : 0) is an unsigned + // overflow. + void + Inst_SOP2__S_ADDC_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = src0.rawData() + src1.rawData() + scc.rawData(); + scc = ((ScalarRegU64)src0.rawData() + (ScalarRegU64)src1.rawData() + + (ScalarRegU64)scc.rawData()) >= 0x100000000ULL ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_SUBB_U32 class methods --- + + Inst_SOP2__S_SUBB_U32::Inst_SOP2__S_SUBB_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_subb_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_SUBB_U32 + + Inst_SOP2__S_SUBB_U32::~Inst_SOP2__S_SUBB_U32() + { + } // ~Inst_SOP2__S_SUBB_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u - SCC; + // SCC = (S1.u + SCC > S0.u ? 1 : 0) is an unsigned overflow. 
+ void + Inst_SOP2__S_SUBB_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = src0.rawData() - src1.rawData() - scc.rawData(); + scc = (src1.rawData() + scc.rawData()) > src0.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MIN_I32 class methods --- + + Inst_SOP2__S_MIN_I32::Inst_SOP2__S_MIN_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_min_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_MIN_I32 + + Inst_SOP2__S_MIN_I32::~Inst_SOP2__S_MIN_I32() + { + } // ~Inst_SOP2__S_MIN_I32 + + // --- description from .arch file --- + // D.i = (S0.i < S1.i) ? S0.i : S1.i; + // SCC = 1 if S0 is chosen as the minimum value. + void + Inst_SOP2__S_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::min(src0.rawData(), src1.rawData()); + scc = (src0.rawData() < src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MIN_U32 class methods --- + + Inst_SOP2__S_MIN_U32::Inst_SOP2__S_MIN_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_min_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_MIN_U32 + + Inst_SOP2__S_MIN_U32::~Inst_SOP2__S_MIN_U32() + { + } // ~Inst_SOP2__S_MIN_U32 + + // --- description from .arch file --- + // D.u = (S0.u < S1.u) ? S0.u : S1.u; + // SCC = 1 if S0 is chosen as the minimum value. 
+ void + Inst_SOP2__S_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::min(src0.rawData(), src1.rawData()); + scc = (src0.rawData() < src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MAX_I32 class methods --- + + Inst_SOP2__S_MAX_I32::Inst_SOP2__S_MAX_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_max_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_MAX_I32 + + Inst_SOP2__S_MAX_I32::~Inst_SOP2__S_MAX_I32() + { + } // ~Inst_SOP2__S_MAX_I32 + + // --- description from .arch file --- + // D.i = (S0.i > S1.i) ? S0.i : S1.i; + // SCC = 1 if S0 is chosen as the maximum value. + void + Inst_SOP2__S_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::max(src0.rawData(), src1.rawData()); + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_MAX_U32 class methods --- + + Inst_SOP2__S_MAX_U32::Inst_SOP2__S_MAX_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_max_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_MAX_U32 + + Inst_SOP2__S_MAX_U32::~Inst_SOP2__S_MAX_U32() + { + } // ~Inst_SOP2__S_MAX_U32 + + // --- description from .arch file --- + // D.u = (S0.u > S1.u) ? S0.u : S1.u; + // SCC = 1 if S0 is chosen as the maximum value. 
+ void + Inst_SOP2__S_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = std::max(src0.rawData(), src1.rawData()); + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_CSELECT_B32 class methods --- + + Inst_SOP2__S_CSELECT_B32::Inst_SOP2__S_CSELECT_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_cselect_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_CSELECT_B32 + + Inst_SOP2__S_CSELECT_B32::~Inst_SOP2__S_CSELECT_B32() + { + } // ~Inst_SOP2__S_CSELECT_B32 + + // --- description from .arch file --- + // D.u = SCC ? S0.u : S1.u (conditional select). + void + Inst_SOP2__S_CSELECT_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = scc.rawData() ? src0.rawData() : src1.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_CSELECT_B64 class methods --- + + Inst_SOP2__S_CSELECT_B64::Inst_SOP2__S_CSELECT_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_cselect_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_CSELECT_B64 + + Inst_SOP2__S_CSELECT_B64::~Inst_SOP2__S_CSELECT_B64() + { + } // ~Inst_SOP2__S_CSELECT_B64 + + // --- description from .arch file --- + // D.u64 = SCC ? S0.u64 : S1.u64 (conditional select). 
+ void + Inst_SOP2__S_CSELECT_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + scc.read(); + + sdst = scc.rawData() ? src0.rawData() : src1.rawData(); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_AND_B32 class methods --- + + Inst_SOP2__S_AND_B32::Inst_SOP2__S_AND_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_and_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_AND_B32 + + Inst_SOP2__S_AND_B32::~Inst_SOP2__S_AND_B32() + { + } // ~Inst_SOP2__S_AND_B32 + + // --- description from .arch file --- + // D.u = S0.u & S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() & src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_AND_B64 class methods --- + + Inst_SOP2__S_AND_B64::Inst_SOP2__S_AND_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_and_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_AND_B64 + + Inst_SOP2__S_AND_B64::~Inst_SOP2__S_AND_B64() + { + } // ~Inst_SOP2__S_AND_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 & S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_AND_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() & src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_OR_B32 class methods --- + + Inst_SOP2__S_OR_B32::Inst_SOP2__S_OR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_or_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_OR_B32 + + Inst_SOP2__S_OR_B32::~Inst_SOP2__S_OR_B32() + { + } // ~Inst_SOP2__S_OR_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() | src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_OR_B64 class methods --- + + Inst_SOP2__S_OR_B64::Inst_SOP2__S_OR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_or_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_OR_B64 + + Inst_SOP2__S_OR_B64::~Inst_SOP2__S_OR_B64() + { + } // ~Inst_SOP2__S_OR_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 | S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_OR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() | src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XOR_B32 class methods --- + + Inst_SOP2__S_XOR_B32::Inst_SOP2__S_XOR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xor_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_XOR_B32 + + Inst_SOP2__S_XOR_B32::~Inst_SOP2__S_XOR_B32() + { + } // ~Inst_SOP2__S_XOR_B32 + + // --- description from .arch file --- + // D.u = S0.u ^ S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() ^ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XOR_B64 class methods --- + + Inst_SOP2__S_XOR_B64::Inst_SOP2__S_XOR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xor_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_XOR_B64 + + Inst_SOP2__S_XOR_B64::~Inst_SOP2__S_XOR_B64() + { + } // ~Inst_SOP2__S_XOR_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 ^ S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_XOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() ^ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ANDN2_B32 class methods --- + + Inst_SOP2__S_ANDN2_B32::Inst_SOP2__S_ANDN2_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_andn2_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_ANDN2_B32 + + Inst_SOP2__S_ANDN2_B32::~Inst_SOP2__S_ANDN2_B32() + { + } // ~Inst_SOP2__S_ANDN2_B32 + + // --- description from .arch file --- + // D.u = S0.u & ~S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_ANDN2_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() &~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ANDN2_B64 class methods --- + + Inst_SOP2__S_ANDN2_B64::Inst_SOP2__S_ANDN2_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_andn2_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_ANDN2_B64 + + Inst_SOP2__S_ANDN2_B64::~Inst_SOP2__S_ANDN2_B64() + { + } // ~Inst_SOP2__S_ANDN2_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 & ~S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_ANDN2_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() &~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ORN2_B32 class methods --- + + Inst_SOP2__S_ORN2_B32::Inst_SOP2__S_ORN2_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_orn2_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_ORN2_B32 + + Inst_SOP2__S_ORN2_B32::~Inst_SOP2__S_ORN2_B32() + { + } // ~Inst_SOP2__S_ORN2_B32 + + // --- description from .arch file --- + // D.u = S0.u | ~S1.u; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_ORN2_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() |~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ORN2_B64 class methods --- + + Inst_SOP2__S_ORN2_B64::Inst_SOP2__S_ORN2_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_orn2_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_ORN2_B64 + + Inst_SOP2__S_ORN2_B64::~Inst_SOP2__S_ORN2_B64() + { + } // ~Inst_SOP2__S_ORN2_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 | ~S1.u64; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_ORN2_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = src0.rawData() |~ src1.rawData(); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NAND_B32 class methods --- + + Inst_SOP2__S_NAND_B32::Inst_SOP2__S_NAND_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nand_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_NAND_B32 + + Inst_SOP2__S_NAND_B32::~Inst_SOP2__S_NAND_B32() + { + } // ~Inst_SOP2__S_NAND_B32 + + // --- description from .arch file --- + // D.u = ~(S0.u & S1.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_NAND_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() & src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NAND_B64 class methods --- + + Inst_SOP2__S_NAND_B64::Inst_SOP2__S_NAND_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nand_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_NAND_B64 + + Inst_SOP2__S_NAND_B64::~Inst_SOP2__S_NAND_B64() + { + } // ~Inst_SOP2__S_NAND_B64 + + // --- description from .arch file --- + // D.u64 = ~(S0.u64 & S1.u64); + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_NAND_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() & src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NOR_B32 class methods --- + + Inst_SOP2__S_NOR_B32::Inst_SOP2__S_NOR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nor_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_NOR_B32 + + Inst_SOP2__S_NOR_B32::~Inst_SOP2__S_NOR_B32() + { + } // ~Inst_SOP2__S_NOR_B32 + + // --- description from .arch file --- + // D.u = ~(S0.u | S1.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_NOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() | src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_NOR_B64 class methods --- + + Inst_SOP2__S_NOR_B64::Inst_SOP2__S_NOR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_nor_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_NOR_B64 + + Inst_SOP2__S_NOR_B64::~Inst_SOP2__S_NOR_B64() + { + } // ~Inst_SOP2__S_NOR_B64 + + // --- description from .arch file --- + // D.u64 = ~(S0.u64 | S1.u64); + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_NOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() | src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XNOR_B32 class methods --- + + Inst_SOP2__S_XNOR_B32::Inst_SOP2__S_XNOR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xnor_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_XNOR_B32 + + Inst_SOP2__S_XNOR_B32::~Inst_SOP2__S_XNOR_B32() + { + } // ~Inst_SOP2__S_XNOR_B32 + + // --- description from .arch file --- + // D.u = ~(S0.u ^ S1.u); + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_XNOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() ^ src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_XNOR_B64 class methods --- + + Inst_SOP2__S_XNOR_B64::Inst_SOP2__S_XNOR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_xnor_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_XNOR_B64 + + Inst_SOP2__S_XNOR_B64::~Inst_SOP2__S_XNOR_B64() + { + } // ~Inst_SOP2__S_XNOR_B64 + + // --- description from .arch file --- + // D.u64 = ~(S0.u64 ^ S1.u64); + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_XNOR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = ~(src0.rawData() ^ src1.rawData()); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHL_B32 class methods --- + + Inst_SOP2__S_LSHL_B32::Inst_SOP2__S_LSHL_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshl_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHL_B32 + + Inst_SOP2__S_LSHL_B32::~Inst_SOP2__S_LSHL_B32() + { + } // ~Inst_SOP2__S_LSHL_B32 + + // --- description from .arch file --- + // D.u = S0.u << S1.u[4:0]; + // SCC = 1 if result is non-zero. + void + Inst_SOP2__S_LSHL_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() << bits(src1.rawData(), 4, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHL_B64 class methods --- + + Inst_SOP2__S_LSHL_B64::Inst_SOP2__S_LSHL_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshl_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHL_B64 + + Inst_SOP2__S_LSHL_B64::~Inst_SOP2__S_LSHL_B64() + { + } // ~Inst_SOP2__S_LSHL_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 << S1.u[5:0]; + // SCC = 1 if result is non-zero. 
+ void + Inst_SOP2__S_LSHL_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() << bits(src1.rawData(), 5, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHR_B32 class methods --- + + Inst_SOP2__S_LSHR_B32::Inst_SOP2__S_LSHR_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshr_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHR_B32 + + Inst_SOP2__S_LSHR_B32::~Inst_SOP2__S_LSHR_B32() + { + } // ~Inst_SOP2__S_LSHR_B32 + + // --- description from .arch file --- + // D.u = S0.u >> S1.u[4:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to zero. + void + Inst_SOP2__S_LSHR_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_LSHR_B64 class methods --- + + Inst_SOP2__S_LSHR_B64::Inst_SOP2__S_LSHR_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_lshr_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_LSHR_B64 + + Inst_SOP2__S_LSHR_B64::~Inst_SOP2__S_LSHR_B64() + { + } // ~Inst_SOP2__S_LSHR_B64 + + // --- description from .arch file --- + // D.u64 = S0.u64 >> S1.u[5:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to zero. 
+ void + Inst_SOP2__S_LSHR_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ASHR_I32 class methods --- + + Inst_SOP2__S_ASHR_I32::Inst_SOP2__S_ASHR_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_ashr_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_ASHR_I32 + + Inst_SOP2__S_ASHR_I32::~Inst_SOP2__S_ASHR_I32() + { + } // ~Inst_SOP2__S_ASHR_I32 + + // --- description from .arch file --- + // D.i = signext(S0.i) >> S1.u[4:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to the sign bit of the input value. + void + Inst_SOP2__S_ASHR_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_ASHR_I64 class methods --- + + Inst_SOP2__S_ASHR_I64::Inst_SOP2__S_ASHR_I64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_ashr_i64") + { + setFlag(ALU); + } // Inst_SOP2__S_ASHR_I64 + + Inst_SOP2__S_ASHR_I64::~Inst_SOP2__S_ASHR_I64() + { + } // ~Inst_SOP2__S_ASHR_I64 + + // --- description from .arch file --- + // D.i64 = signext(S0.i64) >> S1.u[5:0]; + // SCC = 1 if result is non-zero. + // The vacated bits are set to the sign bit of the input value. 
+ void + Inst_SOP2__S_ASHR_I64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0)); + scc = sdst.rawData() ? 1 : 0; + + sdst.write(); + scc.write(); + } // execute + // --- Inst_SOP2__S_BFM_B32 class methods --- + + Inst_SOP2__S_BFM_B32::Inst_SOP2__S_BFM_B32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_bfm_b32") + { + setFlag(ALU); + } // Inst_SOP2__S_BFM_B32 + + Inst_SOP2__S_BFM_B32::~Inst_SOP2__S_BFM_B32() + { + } // ~Inst_SOP2__S_BFM_B32 + + // --- description from .arch file --- + // D.u = ((1 << S0.u[4:0]) - 1) << S1.u[4:0] (bitfield mask). + void + Inst_SOP2__S_BFM_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src0.read(); + src1.read(); + + sdst = ((1 << bits(src0.rawData(), 4, 0)) - 1) + << bits(src1.rawData(), 4, 0); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_BFM_B64 class methods --- + + Inst_SOP2__S_BFM_B64::Inst_SOP2__S_BFM_B64(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_bfm_b64") + { + setFlag(ALU); + } // Inst_SOP2__S_BFM_B64 + + Inst_SOP2__S_BFM_B64::~Inst_SOP2__S_BFM_B64() + { + } // ~Inst_SOP2__S_BFM_B64 + + // --- description from .arch file --- + // D.u64 = ((1ULL << S0.u[5:0]) - 1) << S1.u[5:0] (bitfield mask). 
+    void
+    Inst_SOP2__S_BFM_B64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
+        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
+
+        src0.read();
+        src1.read();
+
+        // Bitfield mask: src0[5:0] is the field width, src1[5:0] the
+        // field offset. 1ULL keeps the shift well-defined for widths > 31.
+        sdst = ((1ULL << bits(src0.rawData(), 5, 0)) - 1)
+            << bits(src1.rawData(), 5, 0);
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP2__S_MUL_I32 class methods ---
+
+    Inst_SOP2__S_MUL_I32::Inst_SOP2__S_MUL_I32(InFmt_SOP2 *iFmt)
+        : Inst_SOP2(iFmt, "s_mul_i32")
+    {
+        setFlag(ALU);
+    } // Inst_SOP2__S_MUL_I32
+
+    Inst_SOP2__S_MUL_I32::~Inst_SOP2__S_MUL_I32()
+    {
+    } // ~Inst_SOP2__S_MUL_I32
+
+    // --- description from .arch file ---
+    // D.i = S0.i * S1.i.
+    void
+    Inst_SOP2__S_MUL_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
+        ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
+        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+
+        src0.read();
+        src1.read();
+
+        // Widen one operand before multiplying so the product is computed
+        // in 64 bits: a 32x32 signed multiply that overflows is undefined
+        // behavior in C++. Only the low 32 bits are the architected result.
+        ScalarRegI64 tmp = (ScalarRegI64)src0.rawData() * src1.rawData();
+        sdst = tmp & mask(32);
+
+        sdst.write();
+    } // execute
+    // --- Inst_SOP2__S_BFE_U32 class methods ---
+
+    Inst_SOP2__S_BFE_U32::Inst_SOP2__S_BFE_U32(InFmt_SOP2 *iFmt)
+        : Inst_SOP2(iFmt, "s_bfe_u32")
+    {
+        setFlag(ALU);
+    } // Inst_SOP2__S_BFE_U32
+
+    Inst_SOP2__S_BFE_U32::~Inst_SOP2__S_BFE_U32()
+    {
+    } // ~Inst_SOP2__S_BFE_U32
+
+    // --- description from .arch file ---
+    // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is
+    // field width.
+    // D.u = (S0.u>>S1.u[4:0]) & ((1<<S1.u[22:16])-1).
+    // SCC = 1 if result is non-zero.
+    void
+    Inst_SOP2__S_BFE_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0);
+        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+        ScalarOperandU32 sdst(gpuDynInst, instData.SDST);
+        ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+        src0.read();
+        src1.read();
+
+        // NOTE(review): a width value >= 32 in src1[22:16] would make this
+        // shift undefined; assumed unreachable for a 32-bit extract -- TODO
+        // confirm against the ISA spec.
+        sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0))
+            & ((1 << bits(src1.rawData(), 22, 16)) - 1);
+        scc = sdst.rawData() ? 1 : 0;
+
+        sdst.write();
+        scc.write();
+    } // execute
+    // --- Inst_SOP2__S_BFE_I32 class methods ---
+
+    Inst_SOP2__S_BFE_I32::Inst_SOP2__S_BFE_I32(InFmt_SOP2 *iFmt)
+        : Inst_SOP2(iFmt, "s_bfe_i32")
+    {
+        setFlag(ALU);
+    } // Inst_SOP2__S_BFE_I32
+
+    Inst_SOP2__S_BFE_I32::~Inst_SOP2__S_BFE_I32()
+    {
+    } // ~Inst_SOP2__S_BFE_I32
+
+    // --- description from .arch file ---
+    // Bit field extract. S0 is Data, S1[4:0] is field offset, S1[22:16] is
+    // field width.
+    // D.i = (S0.i>>S1.u[4:0]) & ((1<<S1.u[22:16])-1).
+    // SCC = 1 if result is non-zero.
+    void
+    Inst_SOP2__S_BFE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
+        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+        ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+        src0.read();
+        src1.read();
+
+        sdst = (src0.rawData() >> bits(src1.rawData(), 4, 0))
+            & ((1 << bits(src1.rawData(), 22, 16)) - 1);
+
+        // Above extracted a signed int of size src1[22:16] bits which needs
+        // to be sign-extended. Check if the MSB of our src1[22:16]-bit
+        // integer is 1, and sign extend if it is.
+        //
+        // Note: The description in the Vega ISA manual does not mention to
+        // sign-extend the result. An updated description can be found in the
+        // more recent RDNA3 manual here:
+        // https://developer.amd.com/wp-content/resources/
+        // RDNA3_Shader_ISA_December2022.pdf
+        if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) {
+            sdst = sdst.rawData()
+                | (0xffffffff << bits(src1.rawData(), 22, 16));
+        }
+
+        scc = sdst.rawData() ? 1 : 0;
+
+        sdst.write();
+        scc.write();
+    } // execute
+    // --- Inst_SOP2__S_BFE_U64 class methods ---
+
+    Inst_SOP2__S_BFE_U64::Inst_SOP2__S_BFE_U64(InFmt_SOP2 *iFmt)
+        : Inst_SOP2(iFmt, "s_bfe_u64")
+    {
+        setFlag(ALU);
+    } // Inst_SOP2__S_BFE_U64
+
+    Inst_SOP2__S_BFE_U64::~Inst_SOP2__S_BFE_U64()
+    {
+    } // ~Inst_SOP2__S_BFE_U64
+
+    // --- description from .arch file ---
+    // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is
+    // field width.
+    // D.u64 = (S0.u64>>S1.u[5:0]) & ((1<<S1.u[22:16])-1).
+    // SCC = 1 if result is non-zero.
+    void
+    Inst_SOP2__S_BFE_U64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0);
+        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
+        ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+        src0.read();
+        src1.read();
+
+        // 1ULL so the mask is computed in 64 bits: a plain int 1 shifted
+        // by a width of 32..63 is undefined and yields a wrong mask.
+        sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0))
+            & ((1ULL << bits(src1.rawData(), 22, 16)) - 1);
+        scc = sdst.rawData() ? 1 : 0;
+
+        sdst.write();
+        scc.write();
+    } // execute
+    // --- Inst_SOP2__S_BFE_I64 class methods ---
+
+    Inst_SOP2__S_BFE_I64::Inst_SOP2__S_BFE_I64(InFmt_SOP2 *iFmt)
+        : Inst_SOP2(iFmt, "s_bfe_i64")
+    {
+        setFlag(ALU);
+    } // Inst_SOP2__S_BFE_I64
+
+    Inst_SOP2__S_BFE_I64::~Inst_SOP2__S_BFE_I64()
+    {
+    } // ~Inst_SOP2__S_BFE_I64
+
+    // --- description from .arch file ---
+    // Bit field extract. S0 is Data, S1[5:0] is field offset, S1[22:16] is
+    // field width.
+    // D.i64 = (S0.i64>>S1.u[5:0]) & ((1<<S1.u[22:16])-1).
+    // SCC = 1 if result is non-zero.
+    void
+    Inst_SOP2__S_BFE_I64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0);
+        ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1);
+        ScalarOperandI64 sdst(gpuDynInst, instData.SDST);
+        ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+        src0.read();
+        src1.read();
+
+        // 1LL so the mask is computed in 64 bits (see S_BFE_U64).
+        sdst = (src0.rawData() >> bits(src1.rawData(), 5, 0))
+            & ((1LL << bits(src1.rawData(), 22, 16)) - 1);
+
+        // Above extracted a signed int of size src1[22:16] bits which needs
+        // to be sign-extended. Check if the MSB of our src1[22:16]-bit
+        // integer is 1, and sign extend if it is.
+        if (sdst.rawData() >> (bits(src1.rawData(), 22, 16) - 1)) {
+            sdst = sdst.rawData()
+                | 0xffffffffffffffff << bits(src1.rawData(), 22, 16);
+        }
+        scc = sdst.rawData() ? 1 : 0;
+
+        sdst.write();
+        scc.write();
+    } // execute
+    // --- Inst_SOP2__S_CBRANCH_G_FORK class methods ---
+
+    Inst_SOP2__S_CBRANCH_G_FORK::Inst_SOP2__S_CBRANCH_G_FORK(InFmt_SOP2 *iFmt)
+        : Inst_SOP2(iFmt, "s_cbranch_g_fork")
+    {
+        setFlag(Branch);
+    } // Inst_SOP2__S_CBRANCH_G_FORK
+
+    Inst_SOP2__S_CBRANCH_G_FORK::~Inst_SOP2__S_CBRANCH_G_FORK()
+    {
+    } // ~Inst_SOP2__S_CBRANCH_G_FORK
+
+    // --- description from .arch file ---
+    // mask_pass = S0.u64 & EXEC;
+    // mask_fail = ~S0.u64 & EXEC;
+    // if (mask_pass == EXEC)
+    //   PC = S1.u64;
+    // elsif (mask_fail == EXEC)
+    //   PC += 4;
+    // elsif (bitcount(mask_fail) < bitcount(mask_pass))
+    //   EXEC = mask_fail;
+    //   SGPR[CSP*4] = { S1.u64, mask_pass };
+    //   CSP++;
+    //   PC += 4;
+    // else
+    //   EXEC = mask_pass;
+    //   SGPR[CSP*4] = { PC + 4, mask_fail };
+    //   CSP++;
+    //   PC = S1.u64;
+    // end.
+    // Conditional branch using branch-stack.
+    // S0 = compare mask(vcc or any sgpr) and
+    // S1 = 64-bit byte address of target instruction.
+    // See also S_CBRANCH_JOIN.
+    void
+    Inst_SOP2__S_CBRANCH_G_FORK::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_SOP2__S_ABSDIFF_I32 class methods ---
+
+    Inst_SOP2__S_ABSDIFF_I32::Inst_SOP2__S_ABSDIFF_I32(InFmt_SOP2 *iFmt)
+        : Inst_SOP2(iFmt, "s_absdiff_i32")
+    {
+        setFlag(ALU);
+    } // Inst_SOP2__S_ABSDIFF_I32
+
+    Inst_SOP2__S_ABSDIFF_I32::~Inst_SOP2__S_ABSDIFF_I32()
+    {
+    } // ~Inst_SOP2__S_ABSDIFF_I32
+
+    // --- description from .arch file ---
+    // D.i = S0.i - S1.i;
+    // if (D.i < 0) then D.i = -D.i;
+    // SCC = 1 if result is non-zero.
+    // Compute the absolute value of difference between two values.
+    void
+    Inst_SOP2__S_ABSDIFF_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0);
+        ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1);
+        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
+        ScalarOperandU32 scc(gpuDynInst, REG_SCC);
+
+        // Bug fix: the source operands were never read before rawData()
+        // was used, so the computation saw stale register values. Every
+        // other SOP2 ALU instruction in this file reads its sources first.
+        src0.read();
+        src1.read();
+
+        // NOTE(review): src0 - src1 can overflow for operands of opposite
+        // sign near INT32_MIN/MAX (UB in C++) -- TODO confirm whether the
+        // hardware semantics require wrapping here.
+        sdst = std::abs(src0.rawData() - src1.rawData());
+        scc = sdst.rawData() ? 1 : 0;
+
+        sdst.write();
+        scc.write();
+    } // execute
+    // --- Inst_SOP2__S_RFE_RESTORE_B64 class methods ---
+
+    Inst_SOP2__S_RFE_RESTORE_B64::Inst_SOP2__S_RFE_RESTORE_B64(
+          InFmt_SOP2 *iFmt)
+        : Inst_SOP2(iFmt, "s_rfe_restore_b64")
+    {
+    } // Inst_SOP2__S_RFE_RESTORE_B64
+
+    Inst_SOP2__S_RFE_RESTORE_B64::~Inst_SOP2__S_RFE_RESTORE_B64()
+    {
+    } // ~Inst_SOP2__S_RFE_RESTORE_B64
+
+    // --- description from .arch file ---
+    // PRIV = 0;
+    // PC = S0.u64;
+    // INST_ATC = S1.u32[0].
+    // Return from exception handler and continue, possibly changing the
+    // instruction ATC mode.
+    // This instruction may only be used within a trap handler.
+    // Use this instruction when the main program may be in a different memory
+    // space than the trap handler.
+ void + Inst_SOP2__S_RFE_RESTORE_B64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOP2__S_MUL_HI_U32 class methods --- + + Inst_SOP2__S_MUL_HI_U32::Inst_SOP2__S_MUL_HI_U32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_mul_hi_u32") + { + setFlag(ALU); + } // Inst_SOP2__S_MUL_HI_U32 + + Inst_SOP2__S_MUL_HI_U32::~Inst_SOP2__S_MUL_HI_U32() + { + } // ~Inst_SOP2__S_MUL_HI_U32 + + // --- description from .arch file --- + // D.u = (S0.u * S1.u) >> 32; + void + Inst_SOP2__S_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + + src0.read(); + src1.read(); + + VecElemU64 tmp_dst = + ((VecElemU64)src0.rawData() * (VecElemU64)src1.rawData()); + sdst = (tmp_dst >> 32); + + sdst.write(); + } // execute + // --- Inst_SOP2__S_MUL_HI_I32 class methods --- + + Inst_SOP2__S_MUL_HI_I32::Inst_SOP2__S_MUL_HI_I32(InFmt_SOP2 *iFmt) + : Inst_SOP2(iFmt, "s_mul_hi_i32") + { + setFlag(ALU); + } // Inst_SOP2__S_MUL_HI_I32 + + Inst_SOP2__S_MUL_HI_I32::~Inst_SOP2__S_MUL_HI_I32() + { + } // ~Inst_SOP2__S_MUL_HI_I32 + + // --- description from .arch file --- + // D.u = (S0.u * S1.u) >> 32; + void + Inst_SOP2__S_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + src0.read(); + src1.read(); + + VecElemI64 tmp_src0 = + sext::digits>(src0.rawData()); + VecElemI64 tmp_src1 = + sext::digits>(src1.rawData()); + sdst = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); + + sdst.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sopc.cc b/src/arch/amdgpu/vega/insts/sopc.cc new file mode 100644 index 0000000000..9c58688e53 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sopc.cc @@ 
-0,0 +1,599 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOPC__S_CMP_EQ_I32 class methods --- + + Inst_SOPC__S_CMP_EQ_I32::Inst_SOPC__S_CMP_EQ_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_eq_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_EQ_I32 + + Inst_SOPC__S_CMP_EQ_I32::~Inst_SOPC__S_CMP_EQ_I32() + { + } // ~Inst_SOPC__S_CMP_EQ_I32 + + // --- description from .arch file --- + // SCC = (S0.i == S1.i). + void + Inst_SOPC__S_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() == src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LG_I32 class methods --- + + Inst_SOPC__S_CMP_LG_I32::Inst_SOPC__S_CMP_LG_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lg_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LG_I32 + + Inst_SOPC__S_CMP_LG_I32::~Inst_SOPC__S_CMP_LG_I32() + { + } // ~Inst_SOPC__S_CMP_LG_I32 + + // --- description from .arch file --- + // SCC = (S0.i != S1.i). + void + Inst_SOPC__S_CMP_LG_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() != src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GT_I32 class methods --- + + Inst_SOPC__S_CMP_GT_I32::Inst_SOPC__S_CMP_GT_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_gt_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GT_I32 + + Inst_SOPC__S_CMP_GT_I32::~Inst_SOPC__S_CMP_GT_I32() + { + } // ~Inst_SOPC__S_CMP_GT_I32 + + // --- description from .arch file --- + // SCC = (S0.i > S1.i). 
+ void + Inst_SOPC__S_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GE_I32 class methods --- + + Inst_SOPC__S_CMP_GE_I32::Inst_SOPC__S_CMP_GE_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_ge_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GE_I32 + + Inst_SOPC__S_CMP_GE_I32::~Inst_SOPC__S_CMP_GE_I32() + { + } // ~Inst_SOPC__S_CMP_GE_I32 + + // --- description from .arch file --- + // SCC = (S0.i >= S1.i). + void + Inst_SOPC__S_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() >= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LT_I32 class methods --- + + Inst_SOPC__S_CMP_LT_I32::Inst_SOPC__S_CMP_LT_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lt_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LT_I32 + + Inst_SOPC__S_CMP_LT_I32::~Inst_SOPC__S_CMP_LT_I32() + { + } // ~Inst_SOPC__S_CMP_LT_I32 + + // --- description from .arch file --- + // SCC = (S0.i < S1.i). + void + Inst_SOPC__S_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() < src1.rawData()) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LE_I32 class methods --- + + Inst_SOPC__S_CMP_LE_I32::Inst_SOPC__S_CMP_LE_I32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_le_i32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LE_I32 + + Inst_SOPC__S_CMP_LE_I32::~Inst_SOPC__S_CMP_LE_I32() + { + } // ~Inst_SOPC__S_CMP_LE_I32 + + // --- description from .arch file --- + // SCC = (S0.i <= S1.i). + void + Inst_SOPC__S_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_EQ_U32 class methods --- + + Inst_SOPC__S_CMP_EQ_U32::Inst_SOPC__S_CMP_EQ_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_eq_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_EQ_U32 + + Inst_SOPC__S_CMP_EQ_U32::~Inst_SOPC__S_CMP_EQ_U32() + { + } // ~Inst_SOPC__S_CMP_EQ_U32 + + // --- description from .arch file --- + // SCC = (S0.u == S1.u). + void + Inst_SOPC__S_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() == src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LG_U32 class methods --- + + Inst_SOPC__S_CMP_LG_U32::Inst_SOPC__S_CMP_LG_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lg_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LG_U32 + + Inst_SOPC__S_CMP_LG_U32::~Inst_SOPC__S_CMP_LG_U32() + { + } // ~Inst_SOPC__S_CMP_LG_U32 + + // --- description from .arch file --- + // SCC = (S0.u != S1.u). 
+ void + Inst_SOPC__S_CMP_LG_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() != src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GT_U32 class methods --- + + Inst_SOPC__S_CMP_GT_U32::Inst_SOPC__S_CMP_GT_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_gt_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GT_U32 + + Inst_SOPC__S_CMP_GT_U32::~Inst_SOPC__S_CMP_GT_U32() + { + } // ~Inst_SOPC__S_CMP_GT_U32 + + // --- description from .arch file --- + // SCC = (S0.u > S1.u). + void + Inst_SOPC__S_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() > src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_GE_U32 class methods --- + + Inst_SOPC__S_CMP_GE_U32::Inst_SOPC__S_CMP_GE_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_ge_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_GE_U32 + + Inst_SOPC__S_CMP_GE_U32::~Inst_SOPC__S_CMP_GE_U32() + { + } // ~Inst_SOPC__S_CMP_GE_U32 + + // --- description from .arch file --- + // SCC = (S0.u >= S1.u). + void + Inst_SOPC__S_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() >= src1.rawData()) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LT_U32 class methods --- + + Inst_SOPC__S_CMP_LT_U32::Inst_SOPC__S_CMP_LT_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lt_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LT_U32 + + Inst_SOPC__S_CMP_LT_U32::~Inst_SOPC__S_CMP_LT_U32() + { + } // ~Inst_SOPC__S_CMP_LT_U32 + + // --- description from .arch file --- + // SCC = (S0.u < S1.u). + void + Inst_SOPC__S_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() < src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LE_U32 class methods --- + + Inst_SOPC__S_CMP_LE_U32::Inst_SOPC__S_CMP_LE_U32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_le_u32") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LE_U32 + + Inst_SOPC__S_CMP_LE_U32::~Inst_SOPC__S_CMP_LE_U32() + { + } // ~Inst_SOPC__S_CMP_LE_U32 + + // --- description from .arch file --- + // SCC = (S0.u <= S1.u). + void + Inst_SOPC__S_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() <= src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP0_B32 class methods --- + + Inst_SOPC__S_BITCMP0_B32::Inst_SOPC__S_BITCMP0_B32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp0_b32") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP0_B32 + + Inst_SOPC__S_BITCMP0_B32::~Inst_SOPC__S_BITCMP0_B32() + { + } // ~Inst_SOPC__S_BITCMP0_B32 + + // --- description from .arch file --- + // SCC = (S0.u[S1.u[4:0]] == 0). 
+ void + Inst_SOPC__S_BITCMP0_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = !bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP1_B32 class methods --- + + Inst_SOPC__S_BITCMP1_B32::Inst_SOPC__S_BITCMP1_B32(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp1_b32") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP1_B32 + + Inst_SOPC__S_BITCMP1_B32::~Inst_SOPC__S_BITCMP1_B32() + { + } // ~Inst_SOPC__S_BITCMP1_B32 + + // --- description from .arch file --- + // SCC = (S0.u[S1.u[4:0]] == 1). + void + Inst_SOPC__S_BITCMP1_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU32 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = bits(src0.rawData(), bits(src1.rawData(), 4, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP0_B64 class methods --- + + Inst_SOPC__S_BITCMP0_B64::Inst_SOPC__S_BITCMP0_B64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp0_b64") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP0_B64 + + Inst_SOPC__S_BITCMP0_B64::~Inst_SOPC__S_BITCMP0_B64() + { + } // ~Inst_SOPC__S_BITCMP0_B64 + + // --- description from .arch file --- + // SCC = (S0.u64[S1.u[5:0]] == 0). + void + Inst_SOPC__S_BITCMP0_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = !bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_BITCMP1_B64 class methods --- + + Inst_SOPC__S_BITCMP1_B64::Inst_SOPC__S_BITCMP1_B64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_bitcmp1_b64") + { + setFlag(ALU); + } // Inst_SOPC__S_BITCMP1_B64 + + Inst_SOPC__S_BITCMP1_B64::~Inst_SOPC__S_BITCMP1_B64() + { + } // ~Inst_SOPC__S_BITCMP1_B64 + + // --- description from .arch file --- + // SCC = (S0.u64[S1.u[5:0]] == 1). + void + Inst_SOPC__S_BITCMP1_B64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandU64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandU32 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = bits(src0.rawData(), bits(src1.rawData(), 5, 0)) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_SETVSKIP class methods --- + + Inst_SOPC__S_SETVSKIP::Inst_SOPC__S_SETVSKIP(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_setvskip") + { + } // Inst_SOPC__S_SETVSKIP + + Inst_SOPC__S_SETVSKIP::~Inst_SOPC__S_SETVSKIP() + { + } // ~Inst_SOPC__S_SETVSKIP + + // --- description from .arch file --- + // VSKIP = S0.u[S1.u[4:0]]. + // Enables and disables VSKIP mode. + // When VSKIP is enabled, no VOP*/M*BUF/MIMG/DS/FLAT/EXP instuctions are + // issued. + // If any vector operations are outstanding, S_WAITCNT must be issued + // before executing. + // This instruction requires one waitstate after executing (e.g. S_NOP 0). + // Example: + // s_waitcnt 0 + // s_setvskip 1, 0 // Enable vskip mode. 
+ // s_nop 1 + void + Inst_SOPC__S_SETVSKIP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPC__S_SET_GPR_IDX_ON class methods --- + + Inst_SOPC__S_SET_GPR_IDX_ON::Inst_SOPC__S_SET_GPR_IDX_ON(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_set_gpr_idx_on") + { + } // Inst_SOPC__S_SET_GPR_IDX_ON + + Inst_SOPC__S_SET_GPR_IDX_ON::~Inst_SOPC__S_SET_GPR_IDX_ON() + { + } // ~Inst_SOPC__S_SET_GPR_IDX_ON + + // --- description from .arch file --- + // MODE.gpr_idx_en = 1; + // M0[7:0] = S0.u[7:0]; + // M0[15:12] = SIMM4 (direct contents of S1 field); + // // Remaining bits of M0 are unmodified. + // Enable GPR indexing mode. Vector operations after this will perform + // relative GPR addressing based on the contents of M0. The structure + // SQ_M0_GPR_IDX_WORD may be used to decode M0. + // The raw contents of the S1 field are read and used to set the enable + // bits. S1[0] = VSRC0_REL, S1[1] = VSRC1_REL, S1[2] = VSRC2_REL and + // S1[3] = VDST_REL. + void + Inst_SOPC__S_SET_GPR_IDX_ON::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPC__S_CMP_EQ_U64 class methods --- + + Inst_SOPC__S_CMP_EQ_U64::Inst_SOPC__S_CMP_EQ_U64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_eq_u64") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_EQ_U64 + + Inst_SOPC__S_CMP_EQ_U64::~Inst_SOPC__S_CMP_EQ_U64() + { + } // ~Inst_SOPC__S_CMP_EQ_U64 + + // --- description from .arch file --- + // SCC = (S0.i64 == S1.i64). + void + Inst_SOPC__S_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() == src1.rawData()) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPC__S_CMP_LG_U64 class methods --- + + Inst_SOPC__S_CMP_LG_U64::Inst_SOPC__S_CMP_LG_U64(InFmt_SOPC *iFmt) + : Inst_SOPC(iFmt, "s_cmp_lg_u64") + { + setFlag(ALU); + } // Inst_SOPC__S_CMP_LG_U64 + + Inst_SOPC__S_CMP_LG_U64::~Inst_SOPC__S_CMP_LG_U64() + { + } // ~Inst_SOPC__S_CMP_LG_U64 + + // --- description from .arch file --- + // SCC = (S0.i64 != S1.i64). + void + Inst_SOPC__S_CMP_LG_U64::execute(GPUDynInstPtr gpuDynInst) + { + ConstScalarOperandI64 src0(gpuDynInst, instData.SSRC0); + ConstScalarOperandI64 src1(gpuDynInst, instData.SSRC1); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src0.read(); + src1.read(); + + scc = (src0.rawData() != src1.rawData()) ? 1 : 0; + + scc.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sopk.cc b/src/arch/amdgpu/vega/insts/sopk.cc new file mode 100644 index 0000000000..7abbb9abb4 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sopk.cc @@ -0,0 +1,648 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "dev/amdgpu/hwreg_defines.hh" +#include "gpu-compute/shader.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOPK__S_MOVK_I32 class methods --- + + Inst_SOPK__S_MOVK_I32::Inst_SOPK__S_MOVK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_movk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_MOVK_I32 + + Inst_SOPK__S_MOVK_I32::~Inst_SOPK__S_MOVK_I32() + { + } // ~Inst_SOPK__S_MOVK_I32 + + // --- description from .arch file --- + // D.i = signext(SIMM16) (sign extension). + void + Inst_SOPK__S_MOVK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + + sdst = simm16; + + sdst.write(); + } // execute + // --- Inst_SOPK__S_CMOVK_I32 class methods --- + + Inst_SOPK__S_CMOVK_I32::Inst_SOPK__S_CMOVK_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmovk_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMOVK_I32 + + Inst_SOPK__S_CMOVK_I32::~Inst_SOPK__S_CMOVK_I32() + { + } // ~Inst_SOPK__S_CMOVK_I32 + + // --- description from .arch file --- + // if (SCC) then D.i = signext(SIMM16); + // else NOP. 
+ // Conditional move with sign extension. + void + Inst_SOPK__S_CMOVK_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ScalarOperandI32 sdst(gpuDynInst, instData.SDST); + ConstScalarOperandU32 scc(gpuDynInst, REG_SCC); + + scc.read(); + + if (scc.rawData()) { + sdst = simm16; + sdst.write(); + } + } // execute + // --- Inst_SOPK__S_CMPK_EQ_I32 class methods --- + + Inst_SOPK__S_CMPK_EQ_I32::Inst_SOPK__S_CMPK_EQ_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_eq_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_EQ_I32 + + Inst_SOPK__S_CMPK_EQ_I32::~Inst_SOPK__S_CMPK_EQ_I32() + { + } // ~Inst_SOPK__S_CMPK_EQ_I32 + + // --- description from .arch file --- + // SCC = (S0.i == signext(SIMM16)). + void + Inst_SOPK__S_CMPK_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() == simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LG_I32 class methods --- + + Inst_SOPK__S_CMPK_LG_I32::Inst_SOPK__S_CMPK_LG_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lg_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LG_I32 + + Inst_SOPK__S_CMPK_LG_I32::~Inst_SOPK__S_CMPK_LG_I32() + { + } // ~Inst_SOPK__S_CMPK_LG_I32 + + // --- description from .arch file --- + // SCC = (S0.i != signext(SIMM16)). + void + Inst_SOPK__S_CMPK_LG_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() != simm16) ? 
1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GT_I32 class methods --- + + Inst_SOPK__S_CMPK_GT_I32::Inst_SOPK__S_CMPK_GT_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_gt_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GT_I32 + + Inst_SOPK__S_CMPK_GT_I32::~Inst_SOPK__S_CMPK_GT_I32() + { + } // ~Inst_SOPK__S_CMPK_GT_I32 + + // --- description from .arch file --- + // SCC = (S0.i > signext(SIMM16)). + void + Inst_SOPK__S_CMPK_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() > simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_GE_I32 class methods --- + + Inst_SOPK__S_CMPK_GE_I32::Inst_SOPK__S_CMPK_GE_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_ge_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_GE_I32 + + Inst_SOPK__S_CMPK_GE_I32::~Inst_SOPK__S_CMPK_GE_I32() + { + } // ~Inst_SOPK__S_CMPK_GE_I32 + + // --- description from .arch file --- + // SCC = (S0.i >= signext(SIMM16)). + void + Inst_SOPK__S_CMPK_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16); + ConstScalarOperandI32 src(gpuDynInst, instData.SDST); + ScalarOperandU32 scc(gpuDynInst, REG_SCC); + + src.read(); + + scc = (src.rawData() >= simm16) ? 1 : 0; + + scc.write(); + } // execute + // --- Inst_SOPK__S_CMPK_LT_I32 class methods --- + + Inst_SOPK__S_CMPK_LT_I32::Inst_SOPK__S_CMPK_LT_I32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_cmpk_lt_i32") + { + setFlag(ALU); + } // Inst_SOPK__S_CMPK_LT_I32 + + Inst_SOPK__S_CMPK_LT_I32::~Inst_SOPK__S_CMPK_LT_I32() + { + } // ~Inst_SOPK__S_CMPK_LT_I32 + + // --- description from .arch file --- + // SCC = (S0.i < signext(SIMM16)). 
    void
    Inst_SOPK__S_CMPK_LT_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        // For SOPK compares the SDST field encodes the source register.
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() < simm16) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LE_I32 class methods ---

    Inst_SOPK__S_CMPK_LE_I32::Inst_SOPK__S_CMPK_LE_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_le_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LE_I32

    Inst_SOPK__S_CMPK_LE_I32::~Inst_SOPK__S_CMPK_LE_I32()
    {
    } // ~Inst_SOPK__S_CMPK_LE_I32

    // --- description from .arch file ---
    // SCC = (S0.i <= signext(SIMM16)).
    void
    Inst_SOPK__S_CMPK_LE_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)sext<16>(instData.SIMM16);
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() <= simm16) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_EQ_U32 class methods ---

    Inst_SOPK__S_CMPK_EQ_U32::Inst_SOPK__S_CMPK_EQ_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_eq_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_EQ_U32

    Inst_SOPK__S_CMPK_EQ_U32::~Inst_SOPK__S_CMPK_EQ_U32()
    {
    } // ~Inst_SOPK__S_CMPK_EQ_U32

    // --- description from .arch file ---
    // SCC = (S0.u == SIMM16).
    void
    Inst_SOPK__S_CMPK_EQ_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unsigned variant: the immediate is zero-extended, not sign-extended.
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() == simm16) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LG_U32 class methods ---

    Inst_SOPK__S_CMPK_LG_U32::Inst_SOPK__S_CMPK_LG_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lg_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LG_U32

    Inst_SOPK__S_CMPK_LG_U32::~Inst_SOPK__S_CMPK_LG_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LG_U32

    // --- description from .arch file ---
    // SCC = (S0.u != SIMM16).
    void
    Inst_SOPK__S_CMPK_LG_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() != simm16) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GT_U32 class methods ---

    Inst_SOPK__S_CMPK_GT_U32::Inst_SOPK__S_CMPK_GT_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_gt_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GT_U32

    Inst_SOPK__S_CMPK_GT_U32::~Inst_SOPK__S_CMPK_GT_U32()
    {
    } // ~Inst_SOPK__S_CMPK_GT_U32

    // --- description from .arch file ---
    // SCC = (S0.u > SIMM16).
    void
    Inst_SOPK__S_CMPK_GT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        // Unsigned comparison: both operands are U32.
        scc = (src.rawData() > simm16) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_GE_U32 class methods ---

    Inst_SOPK__S_CMPK_GE_U32::Inst_SOPK__S_CMPK_GE_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_ge_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_GE_U32

    Inst_SOPK__S_CMPK_GE_U32::~Inst_SOPK__S_CMPK_GE_U32()
    {
    } // ~Inst_SOPK__S_CMPK_GE_U32

    // --- description from .arch file ---
    // SCC = (S0.u >= SIMM16).
    void
    Inst_SOPK__S_CMPK_GE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() >= simm16) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LT_U32 class methods ---

    Inst_SOPK__S_CMPK_LT_U32::Inst_SOPK__S_CMPK_LT_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_lt_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LT_U32

    Inst_SOPK__S_CMPK_LT_U32::~Inst_SOPK__S_CMPK_LT_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LT_U32

    // --- description from .arch file ---
    // SCC = (S0.u < SIMM16).
    void
    Inst_SOPK__S_CMPK_LT_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() < simm16) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPK__S_CMPK_LE_U32 class methods ---

    Inst_SOPK__S_CMPK_LE_U32::Inst_SOPK__S_CMPK_LE_U32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cmpk_le_u32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_CMPK_LE_U32

    Inst_SOPK__S_CMPK_LE_U32::~Inst_SOPK__S_CMPK_LE_U32()
    {
    } // ~Inst_SOPK__S_CMPK_LE_U32

    // --- description from .arch file ---
    // SCC = (S0.u <= SIMM16).
    void
    Inst_SOPK__S_CMPK_LE_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegU32 simm16 = (ScalarRegU32)instData.SIMM16;
        ConstScalarOperandU32 src(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        scc = (src.rawData() <= simm16) ? 1 : 0;

        scc.write();
    } // execute
    // --- Inst_SOPK__S_ADDK_I32 class methods ---

    Inst_SOPK__S_ADDK_I32::Inst_SOPK__S_ADDK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_addk_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_ADDK_I32

    Inst_SOPK__S_ADDK_I32::~Inst_SOPK__S_ADDK_I32()
    {
    } // ~Inst_SOPK__S_ADDK_I32

    // --- description from .arch file ---
    // D.i = D.i + signext(SIMM16);
    // SCC = overflow.
    void
    Inst_SOPK__S_ADDK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI16 simm16 = instData.SIMM16;
        // SDST is both source and destination; read via a separate const view.
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);
        ScalarOperandU32 scc(gpuDynInst, REG_SCC);

        src.read();

        sdst = src.rawData() + (ScalarRegI32)sext<16>(simm16);
        // Signed-overflow detection: the addends share a sign bit but the
        // result's sign bit differs.
        scc = (bits(src.rawData(), 31) == bits(simm16, 15)
            && bits(src.rawData(), 31) != bits(sdst.rawData(), 31)) ? 1 : 0;

        sdst.write();
        scc.write();
    } // execute
    // --- Inst_SOPK__S_MULK_I32 class methods ---

    Inst_SOPK__S_MULK_I32::Inst_SOPK__S_MULK_I32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_mulk_i32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_MULK_I32

    Inst_SOPK__S_MULK_I32::~Inst_SOPK__S_MULK_I32()
    {
    } // ~Inst_SOPK__S_MULK_I32

    // --- description from .arch file ---
    // D.i = D.i * signext(SIMM16).
    void
    Inst_SOPK__S_MULK_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI16 simm16 = instData.SIMM16;
        ConstScalarOperandI32 src(gpuDynInst, instData.SDST);
        ScalarOperandI32 sdst(gpuDynInst, instData.SDST);

        src.read();

        // Unlike S_ADDK_I32, no SCC/overflow result is defined for multiply.
        sdst = src.rawData() * (ScalarRegI32)sext<16>(simm16);

        sdst.write();
    } // execute
    // --- Inst_SOPK__S_CBRANCH_I_FORK class methods ---

    Inst_SOPK__S_CBRANCH_I_FORK::Inst_SOPK__S_CBRANCH_I_FORK(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_cbranch_i_fork")
    {
        setFlag(Branch);
    } // Inst_SOPK__S_CBRANCH_I_FORK

    Inst_SOPK__S_CBRANCH_I_FORK::~Inst_SOPK__S_CBRANCH_I_FORK()
    {
    } // ~Inst_SOPK__S_CBRANCH_I_FORK

    // --- description from .arch file ---
    // mask_pass = S0.u64 & EXEC;
    // mask_fail = ~S0.u64 & EXEC;
    // target_addr = PC + signext(SIMM16 * 4) + 4;
    // if (mask_pass == EXEC)
    //     PC = target_addr;
    // elsif (mask_fail == EXEC)
    //     PC += 4;
    // elsif (bitcount(mask_fail) < bitcount(mask_pass))
    //     EXEC = mask_fail;
    //     SGPR[CSP*4] = { target_addr, mask_pass };
    //     CSP++;
    //     PC += 4;
    // else
    //     EXEC = mask_pass;
    //     SGPR[CSP*4] = { PC + 4, mask_fail };
    //     CSP++;
    //     PC = target_addr;
    // end.
    // Conditional branch using branch-stack.
    // S0 = compare mask(vcc or any sgpr), and
    // SIMM16 = signed DWORD branch offset relative to next instruction.
    // See also S_CBRANCH_JOIN.
    void
    Inst_SOPK__S_CBRANCH_I_FORK::execute(GPUDynInstPtr gpuDynInst)
    {
        // Branch-stack forking is not modeled by the simulator.
        panicUnimplemented();
    } // execute
    // --- Inst_SOPK__S_GETREG_B32 class methods ---

    Inst_SOPK__S_GETREG_B32::Inst_SOPK__S_GETREG_B32(InFmt_SOPK *iFmt)
        : Inst_SOPK(iFmt, "s_getreg_b32")
    {
        setFlag(ALU);
    } // Inst_SOPK__S_GETREG_B32

    Inst_SOPK__S_GETREG_B32::~Inst_SOPK__S_GETREG_B32()
    {
    } // ~Inst_SOPK__S_GETREG_B32

    // --- description from .arch file ---
    // D.u = hardware-reg. Read some or all of a hardware register into the
    // LSBs of D.
+ // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size + // is 1..32. + void + Inst_SOPK__S_GETREG_B32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ScalarRegU32 hwregId = simm16 & 0x3f; + ScalarRegU32 offset = (simm16 >> 6) & 31; + ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + + ScalarRegU32 hwreg = + gpuDynInst->computeUnit()->shader->getHwReg(hwregId); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + sdst.read(); + + // Store value from hardware to part of the SDST. + ScalarRegU32 mask = (((1U << size) - 1U) << offset); + sdst = (hwreg & mask) >> offset; + sdst.write(); + } // execute + // --- Inst_SOPK__S_SETREG_B32 class methods --- + + Inst_SOPK__S_SETREG_B32::Inst_SOPK__S_SETREG_B32(InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_setreg_b32") + { + setFlag(ALU); + } // Inst_SOPK__S_SETREG_B32 + + Inst_SOPK__S_SETREG_B32::~Inst_SOPK__S_SETREG_B32() + { + } // ~Inst_SOPK__S_SETREG_B32 + + // --- description from .arch file --- + // hardware-reg = S0.u. Write some or all of the LSBs of D into a hardware + // register. + // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size + // is 1..32. + void + Inst_SOPK__S_SETREG_B32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ScalarRegU32 hwregId = simm16 & 0x3f; + ScalarRegU32 offset = (simm16 >> 6) & 31; + ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + + ScalarRegU32 hwreg = + gpuDynInst->computeUnit()->shader->getHwReg(hwregId); + ScalarOperandU32 sdst(gpuDynInst, instData.SDST); + sdst.read(); + + // Store value from SDST to part of the hardware register. 
+ ScalarRegU32 mask = (((1U << size) - 1U) << offset); + hwreg = ((hwreg & ~mask) | ((sdst.rawData() << offset) & mask)); + gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); + + // set MODE register to control the behavior of single precision + // floating-point numbers: denormal mode or round mode + if (hwregId==1 && size==2 + && (offset==4 || offset==0)) { + warn_once("Be cautious that s_setreg_b32 has no real effect " + "on FP modes: %s\n", gpuDynInst->disassemble()); + return; + } + + // panic if not changing MODE of floating-point numbers + panicUnimplemented(); + } // execute + // --- Inst_SOPK__S_SETREG_IMM32_B32 class methods --- + + Inst_SOPK__S_SETREG_IMM32_B32::Inst_SOPK__S_SETREG_IMM32_B32( + InFmt_SOPK *iFmt) + : Inst_SOPK(iFmt, "s_setreg_imm32_b32") + { + setFlag(ALU); + } // Inst_SOPK__S_SETREG_IMM32_B32 + + Inst_SOPK__S_SETREG_IMM32_B32::~Inst_SOPK__S_SETREG_IMM32_B32() + { + } // ~Inst_SOPK__S_SETREG_IMM32_B32 + + // --- description from .arch file --- + // Write some or all of the LSBs of IMM32 into a hardware register; this + // --- instruction requires a 32-bit literal constant. + // SIMM16 = {size[4:0], offset[4:0], hwRegId[5:0]}; offset is 0..31, size + // is 1..32. + void + Inst_SOPK__S_SETREG_IMM32_B32::execute(GPUDynInstPtr gpuDynInst) + { + ScalarRegI16 simm16 = instData.SIMM16; + ScalarRegU32 hwregId = simm16 & 0x3f; + ScalarRegU32 offset = (simm16 >> 6) & 31; + ScalarRegU32 size = ((simm16 >> 11) & 31) + 1; + + ScalarRegU32 hwreg = + gpuDynInst->computeUnit()->shader->getHwReg(hwregId); + ScalarRegI32 simm32 = extData.imm_u32; + + // Store value from SIMM32 to part of the hardware register. 
+ ScalarRegU32 mask = (((1U << size) - 1U) << offset); + hwreg = ((hwreg & ~mask) | ((simm32 << offset) & mask)); + gpuDynInst->computeUnit()->shader->setHwReg(hwregId, hwreg); + + // set MODE register to control the behavior of single precision + // floating-point numbers: denormal mode or round mode + if (hwregId==HW_REG_MODE && size==2 + && (offset==4 || offset==0)) { + warn_once("Be cautious that s_setreg_imm32_b32 has no real effect " + "on FP modes: %s\n", gpuDynInst->disassemble()); + return; + } + + // panic if not changing modes of single-precision FPs + panicUnimplemented(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/sopp.cc b/src/arch/amdgpu/vega/insts/sopp.cc new file mode 100644 index 0000000000..781113b204 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/sopp.cc @@ -0,0 +1,922 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "debug/GPUSync.hh" +#include "gpu-compute/shader.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_SOPP__S_NOP class methods --- + + Inst_SOPP__S_NOP::Inst_SOPP__S_NOP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_nop") + { + setFlag(Nop); + } // Inst_SOPP__S_NOP + + Inst_SOPP__S_NOP::~Inst_SOPP__S_NOP() + { + } // ~Inst_SOPP__S_NOP + + // --- description from .arch file --- + // Do nothing. Repeat NOP 1..8 times based on SIMM16[2:0] -- 0 = 1 time, + // 7 = 8 times. + // This instruction may be used to introduce wait states to resolve + // hazards; see the shader programming guide for details. Compare with + // S_SLEEP. + void + Inst_SOPP__S_NOP::execute(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_SOPP__S_ENDPGM class methods --- + + Inst_SOPP__S_ENDPGM::Inst_SOPP__S_ENDPGM(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_endpgm") + { + setFlag(EndOfKernel); + } // Inst_SOPP__S_ENDPGM + + Inst_SOPP__S_ENDPGM::~Inst_SOPP__S_ENDPGM() + { + } // ~Inst_SOPP__S_ENDPGM + + // --- description from .arch file --- + // End of program; terminate wavefront. + // The hardware implicitly executes S_WAITCNT 0 before executing this + // --- instruction. + // See S_ENDPGM_SAVED for the context-switch version of this instruction. 
    void
    Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();

        // delete extra instructions fetched for completed work-items
        wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
                                    wf->instructionBuffer.end());

        if (wf->pendingFetch) {
            wf->dropFetch = true;
        }

        wf->computeUnit->fetchStage.fetchUnit(wf->simdId)
            .flushBuf(wf->wfSlotId);
        wf->setStatus(Wavefront::S_STOPPED);

        // Drop this WF's claim on the work-group's LDS allocation; the
        // returned refCount tells us whether other WFs of the WG remain.
        int refCount = wf->computeUnit->getLds()
            .decreaseRefCounter(wf->dispatchId, wf->wgId);

        /**
         * The parent WF of this instruction is exiting, therefore
         * it should not participate in this barrier any longer. This
         * prevents possible deadlock issues if WFs exit early.
         */
        int bar_id = WFBarrier::InvalidID;
        if (wf->hasBarrier()) {
            assert(wf->getStatus() != Wavefront::S_BARRIER);
            bar_id = wf->barrierId();
            assert(bar_id != WFBarrier::InvalidID);
            wf->releaseBarrier();
            cu->decMaxBarrierCnt(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
                    "program and decrementing max barrier count for "
                    "barrier Id%d. New max count: %d.\n", cu->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
                    cu->maxBarrierCnt(bar_id));
        }

        DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
                wf->computeUnit->cu_id, wf->wgId, refCount);

        wf->computeUnit->registerManager->freeRegisters(wf);
        wf->computeUnit->stats.completedWfs++;
        wf->computeUnit->activeWaves--;

        panic_if(wf->computeUnit->activeWaves < 0, "CU[%d] Active waves less "
                 "than zero\n", wf->computeUnit->cu_id);

        DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
                wf->computeUnit->cu_id, wf->simdId, wf->wfSlotId, wf->wfDynId);

        // Sample the read-after-write distance stats gathered over the
        // wave's lifetime, then reset per-wave bookkeeping.
        for (int i = 0; i < wf->vecReads.size(); i++) {
            if (wf->rawDist.find(i) != wf->rawDist.end()) {
                wf->stats.readsPerWrite.sample(wf->vecReads.at(i));
            }
        }
        wf->vecReads.clear();
        wf->rawDist.clear();
        wf->lastInstExec = 0;

        if (!refCount) {
            /**
             * If all WFs have finished, and hence the WG has finished,
             * then we can free up the barrier belonging to the parent
             * WG, but only if we actually used a barrier (i.e., more
             * than one WF in the WG).
             */
            if (bar_id != WFBarrier::InvalidID) {
                DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are "
                        "now complete. Releasing barrier Id%d.\n", cu->cu_id,
                        wf->simdId, wf->wfSlotId, wf->wfDynId,
                        wf->barrierId());
                cu->releaseBarrier(bar_id);
            }

            /**
             * Last wavefront of the workgroup has executed return. If the
             * workgroup is not the final one in the kernel, then simply
             * retire it; however, if it is the final one, i.e., indicating
             * the kernel end, then release operation (i.e., GL2 WB) is
             * needed
             */

            //check whether the workgroup is indicating the kernel end, i.e.,
            //the last workgroup in the kernel
            bool kernelEnd =
                wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf);

            bool relNeeded =
                wf->computeUnit->shader->impl_kern_end_rel;

            //if it is not a kernel end, then retire the workgroup directly
            if (!kernelEnd || !relNeeded) {
                wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
                wf->setStatus(Wavefront::S_STOPPED);
                wf->computeUnit->stats.completedWGs++;

                return;
            }

            /**
             * if it is a kernel end, inject a memory sync, i.e., GL2 WB, and
             * retire the workgroup after receving response.
             * note that GL0V and GL1 are read only, and they just forward GL2
             * WB request. When forwarding, GL1 send the request to all GL2 in
             * the complex
             */
            setFlag(MemSync);
            setFlag(GlobalSegment);
            // Notify Memory System of Kernel Completion
            // Kernel End = isKernel + isMemSync
            wf->setStatus(Wavefront::S_RETURNING);
            gpuDynInst->simdId = wf->simdId;
            gpuDynInst->wfSlotId = wf->wfSlotId;
            gpuDynInst->wfDynId = wf->wfDynId;

            DPRINTF(GPUExec, "inject global memory fence for CU%d: "
                    "WF[%d][%d][%d]\n", wf->computeUnit->cu_id,
                    wf->simdId, wf->wfSlotId, wf->wfDynId);

            // call shader to prepare the flush operations
            wf->computeUnit->shader->prepareFlush(gpuDynInst);

            wf->computeUnit->stats.completedWGs++;
        } else {
            wf->computeUnit->shader->dispatcher().scheduleDispatch();
        }
    } // execute

    // --- Inst_SOPP__S_BRANCH class methods ---

    Inst_SOPP__S_BRANCH::Inst_SOPP__S_BRANCH(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_branch")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_BRANCH

    Inst_SOPP__S_BRANCH::~Inst_SOPP__S_BRANCH()
    {
    } // ~Inst_SOPP__S_BRANCH

    // --- description from .arch file ---
    // PC = PC + signext(SIMM16 * 4) + 4 (short jump).
    // For a long jump, use S_SETPC.
    void
    Inst_SOPP__S_BRANCH::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;

        // SIMM16 is a signed DWORD offset relative to the next instruction.
        pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;

        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_WAKEUP class methods ---

    Inst_SOPP__S_WAKEUP::Inst_SOPP__S_WAKEUP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_wakeup")
    {
    } // Inst_SOPP__S_WAKEUP

    Inst_SOPP__S_WAKEUP::~Inst_SOPP__S_WAKEUP()
    {
    } // ~Inst_SOPP__S_WAKEUP

    // --- description from .arch file ---
    // Allow a wave to 'ping' all the other waves in its threadgroup to force
    // them to wake up immediately from an S_SLEEP instruction. The ping is
    // ignored if the waves are not sleeping.
    // This allows for more efficient polling on a memory location. The waves
    // which are polling can sit in a long S_SLEEP between memory reads, but
    // the wave which writes the value can tell them all to wake up early now
    // that the data is available. This is useful for fBarrier implementations
    // (speedup).
    // This method is also safe from races because if any wave misses the ping,
    // everything still works fine (whoever missed it just completes their
    // normal S_SLEEP).
    void
    Inst_SOPP__S_WAKEUP::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_CBRANCH_SCC0 class methods ---

    Inst_SOPP__S_CBRANCH_SCC0::Inst_SOPP__S_CBRANCH_SCC0(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_scc0")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_SCC0

    Inst_SOPP__S_CBRANCH_SCC0::~Inst_SOPP__S_CBRANCH_SCC0()
    {
    } // ~Inst_SOPP__S_CBRANCH_SCC0

    // --- description from .arch file ---
    // if (SCC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_SCC0::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        // Branch taken only when SCC is clear.
        if (!scc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_SCC1 class methods ---

    Inst_SOPP__S_CBRANCH_SCC1::Inst_SOPP__S_CBRANCH_SCC1(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_scc1")
    {
        setFlag(Branch);
    } // Inst_SOPP__S_CBRANCH_SCC1

    Inst_SOPP__S_CBRANCH_SCC1::~Inst_SOPP__S_CBRANCH_SCC1()
    {
    } // ~Inst_SOPP__S_CBRANCH_SCC1

    // --- description from .arch file ---
    // if (SCC == 1) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_SCC1::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;
        ConstScalarOperandU32 scc(gpuDynInst, REG_SCC);

        scc.read();

        if (scc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_VCCZ class methods ---

    Inst_SOPP__S_CBRANCH_VCCZ::Inst_SOPP__S_CBRANCH_VCCZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_vccz")
    {
        setFlag(Branch);
        setFlag(ReadsVCC);
    } // Inst_SOPP__S_CBRANCH_VCCZ

    Inst_SOPP__S_CBRANCH_VCCZ::~Inst_SOPP__S_CBRANCH_VCCZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_VCCZ

    // --- description from .arch file ---
    // if (VCC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_VCCZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        // The full 64-bit VCC pair is tested against zero.
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
        Addr pc = gpuDynInst->pc();
        ScalarRegI16 simm16 = instData.SIMM16;

        vcc.read();

        if (!vcc.rawData()) {
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
        }

        wf->pc(pc);
    } // execute
    // --- Inst_SOPP__S_CBRANCH_VCCNZ class methods ---

    Inst_SOPP__S_CBRANCH_VCCNZ::Inst_SOPP__S_CBRANCH_VCCNZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_vccnz")
    {
        setFlag(Branch);
        setFlag(ReadsVCC);
    } // Inst_SOPP__S_CBRANCH_VCCNZ

    Inst_SOPP__S_CBRANCH_VCCNZ::~Inst_SOPP__S_CBRANCH_VCCNZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_VCCNZ

    // --- description from .arch file ---
    // if (VCC != 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_VCCNZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        vcc.read();

        if (vcc.rawData()) {
            Addr pc = gpuDynInst->pc();
            ScalarRegI16 simm16 = instData.SIMM16;
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
            wf->pc(pc);
        }
    } // execute
    // --- Inst_SOPP__S_CBRANCH_EXECZ class methods ---

    Inst_SOPP__S_CBRANCH_EXECZ::Inst_SOPP__S_CBRANCH_EXECZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_execz")
    {
        setFlag(Branch);
        setFlag(ReadsEXEC);
    } // Inst_SOPP__S_CBRANCH_EXECZ

    Inst_SOPP__S_CBRANCH_EXECZ::~Inst_SOPP__S_CBRANCH_EXECZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_EXECZ

    // --- description from .arch file ---
    // if (EXEC == 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_EXECZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        // Taken when no lane of the exec mask is set.
        if (wf->execMask().none()) {
            Addr pc = gpuDynInst->pc();
            ScalarRegI16 simm16 = instData.SIMM16;
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
            wf->pc(pc);
        }
    } // execute
    // --- Inst_SOPP__S_CBRANCH_EXECNZ class methods ---

    Inst_SOPP__S_CBRANCH_EXECNZ::Inst_SOPP__S_CBRANCH_EXECNZ(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_cbranch_execnz")
    {
        setFlag(Branch);
        setFlag(ReadsEXEC);
    } // Inst_SOPP__S_CBRANCH_EXECNZ

    Inst_SOPP__S_CBRANCH_EXECNZ::~Inst_SOPP__S_CBRANCH_EXECNZ()
    {
    } // ~Inst_SOPP__S_CBRANCH_EXECNZ

    // --- description from .arch file ---
    // if (EXEC != 0) then PC = PC + signext(SIMM16 * 4) + 4;
    // else NOP.
    void
    Inst_SOPP__S_CBRANCH_EXECNZ::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();

        // Taken when at least one lane of the exec mask is set.
        if (wf->execMask().any()) {
            Addr pc = gpuDynInst->pc();
            ScalarRegI16 simm16 = instData.SIMM16;
            pc = pc + ((ScalarRegI64)simm16 * 4LL) + 4LL;
            wf->pc(pc);
        }
    } // execute
    // --- Inst_SOPP__S_BARRIER class methods ---

    Inst_SOPP__S_BARRIER::Inst_SOPP__S_BARRIER(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_barrier")
    {
        setFlag(MemBarrier);
    } // Inst_SOPP__S_BARRIER

    Inst_SOPP__S_BARRIER::~Inst_SOPP__S_BARRIER()
    {
    } // ~Inst_SOPP__S_BARRIER

    // --- description from .arch file ---
    // Synchronize waves within a threadgroup.
    // If not all waves of the threadgroup have been created yet, waits for
    // entire group before proceeding.
    // If some waves in the threadgroup have already terminated, this waits on
    // only the surviving waves.
    // Barriers are legal inside trap handlers.
    void
    Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ComputeUnit *cu = gpuDynInst->computeUnit();

        // A single-wave WG has no barrier allocated; nothing to do then.
        if (wf->hasBarrier()) {
            int bar_id = wf->barrierId();
            assert(wf->getStatus() == Wavefront::S_BARRIER);
            cu->incNumAtBarrier(bar_id);
            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at "
                    "barrier Id%d. %d waves now at barrier, %d waves "
                    "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId,
                    wf->wfDynId, bar_id, cu->numAtBarrier(bar_id),
                    cu->numYetToReachBarrier(bar_id));
        }
    } // execute
    // --- Inst_SOPP__S_SETKILL class methods ---

    Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_setkill")
    {
    } // Inst_SOPP__S_SETKILL

    Inst_SOPP__S_SETKILL::~Inst_SOPP__S_SETKILL()
    {
    } // ~Inst_SOPP__S_SETKILL

    // --- description from .arch file ---
    // set KILL bit to value of SIMM16[0].
    // Used primarily for debugging kill wave host command behavior.
    void
    Inst_SOPP__S_SETKILL::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_WAITCNT class methods ---

    Inst_SOPP__S_WAITCNT::Inst_SOPP__S_WAITCNT(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_waitcnt")
    {
        setFlag(ALU);
        setFlag(Waitcnt);
    } // Inst_SOPP__S_WAITCNT

    Inst_SOPP__S_WAITCNT::~Inst_SOPP__S_WAITCNT()
    {
    } // ~Inst_SOPP__S_WAITCNT

    // --- description from .arch file ---
    // Wait for the counts of outstanding lds, vector-memory and
    // --- export/vmem-write-data to be at or below the specified levels.
    // SIMM16[3:0] = vmcount (vector memory operations),
    // SIMM16[6:4] = export/mem-write-data count,
    // SIMM16[12:8] = LGKM_cnt (scalar-mem/GDS/LDS count).
    void
    Inst_SOPP__S_WAITCNT::execute(GPUDynInstPtr gpuDynInst)
    {
        // Unpack the three wait thresholds from SIMM16 and stall the wave
        // until the outstanding-op counters drop to those levels.
        ScalarRegI32 vm_cnt = 0;
        ScalarRegI32 exp_cnt = 0;
        ScalarRegI32 lgkm_cnt = 0;
        vm_cnt = bits(instData.SIMM16, 3, 0);
        exp_cnt = bits(instData.SIMM16, 6, 4);
        lgkm_cnt = bits(instData.SIMM16, 12, 8);
        gpuDynInst->wavefront()->setStatus(Wavefront::S_WAITCNT);
        gpuDynInst->wavefront()->setWaitCnts(vm_cnt, exp_cnt, lgkm_cnt);
    } // execute
    // --- Inst_SOPP__S_SETHALT class methods ---

    Inst_SOPP__S_SETHALT::Inst_SOPP__S_SETHALT(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sethalt")
    {
    } // Inst_SOPP__S_SETHALT

    Inst_SOPP__S_SETHALT::~Inst_SOPP__S_SETHALT()
    {
    } // ~Inst_SOPP__S_SETHALT

    // --- description from .arch file ---
    // Set HALT bit to value of SIMM16[0]; 1 = halt, 0 = resume.
    // The halt flag is ignored while PRIV == 1 (inside trap handlers) but the
    // shader will halt immediately after the handler returns if HALT is still
    // set at that time.
    void
    Inst_SOPP__S_SETHALT::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_SOPP__S_SLEEP class methods ---

    Inst_SOPP__S_SLEEP::Inst_SOPP__S_SLEEP(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sleep")
    {
        setFlag(ALU);
        setFlag(Sleep);
    } // Inst_SOPP__S_SLEEP

    Inst_SOPP__S_SLEEP::~Inst_SOPP__S_SLEEP()
    {
    } // ~Inst_SOPP__S_SLEEP

    // --- description from .arch file ---
    // Cause a wave to sleep for (64 * SIMM16[2:0] + 1..64) clocks.
    // The exact amount of delay is approximate. Compare with S_NOP.
    void
    Inst_SOPP__S_SLEEP::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegI32 simm16 = (ScalarRegI32)instData.SIMM16;
        gpuDynInst->wavefront()->setStatus(Wavefront::S_STALLED_SLEEP);
        // sleep duration is specified in multiples of 64 cycles
        gpuDynInst->wavefront()->setSleepTime(64 * simm16);
    } // execute
    // --- Inst_SOPP__S_SETPRIO class methods ---

    Inst_SOPP__S_SETPRIO::Inst_SOPP__S_SETPRIO(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_setprio")
    {
        setFlag(ALU);
    } // Inst_SOPP__S_SETPRIO

    Inst_SOPP__S_SETPRIO::~Inst_SOPP__S_SETPRIO()
    {
    } // ~Inst_SOPP__S_SETPRIO

    // --- description from .arch file ---
    // User settable wave priority is set to SIMM16[1:0]. 0 = lowest,
    // 3 = highest.
    // The overall wave priority is {SPIPrio[1:0] + UserPrio[1:0],
    // WaveAge[3:0]}.
    void
    Inst_SOPP__S_SETPRIO::execute(GPUDynInstPtr gpuDynInst)
    {
        ScalarRegU16 simm16 = instData.SIMM16;
        ScalarRegU32 userPrio = simm16 & 0x3;

        // Wave priorities are not modeled; make that visible once per run.
        warn_once("S_SETPRIO ignored -- Requested priority %d\n", userPrio);
    } // execute
    // --- Inst_SOPP__S_SENDMSG class methods ---

    Inst_SOPP__S_SENDMSG::Inst_SOPP__S_SENDMSG(InFmt_SOPP *iFmt)
        : Inst_SOPP(iFmt, "s_sendmsg")
    {
    } // Inst_SOPP__S_SENDMSG

    Inst_SOPP__S_SENDMSG::~Inst_SOPP__S_SENDMSG()
    {
    } // ~Inst_SOPP__S_SENDMSG

    // --- description from .arch file ---
    // Send a message upstream to VGT or the interrupt handler.
    // SIMM16[9:0] contains the message type and is documented in the shader
    // --- programming guide.
+ void + Inst_SOPP__S_SENDMSG::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_SENDMSGHALT class methods --- + + Inst_SOPP__S_SENDMSGHALT::Inst_SOPP__S_SENDMSGHALT(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_sendmsghalt") + { + } // Inst_SOPP__S_SENDMSGHALT + + Inst_SOPP__S_SENDMSGHALT::~Inst_SOPP__S_SENDMSGHALT() + { + } // ~Inst_SOPP__S_SENDMSGHALT + + // --- description from .arch file --- + // Send a message and then HALT the wavefront; see S_SENDMSG for details. + void + Inst_SOPP__S_SENDMSGHALT::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_TRAP class methods --- + + Inst_SOPP__S_TRAP::Inst_SOPP__S_TRAP(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_trap") + { + } // Inst_SOPP__S_TRAP + + Inst_SOPP__S_TRAP::~Inst_SOPP__S_TRAP() + { + } // ~Inst_SOPP__S_TRAP + + // --- description from .arch file --- + // TrapID = SIMM16[7:0]; + // Wait for all instructions to complete; + // set {TTMP1, TTMP0} = {3'h0, PCRewind[3:0], HT[0], TrapID[7:0], + // PC[47:0]}; + // PC = TBA (trap base address); + // PRIV = 1. + // Enter the trap handler. This instruction may be generated internally as + // well in response to a host trap (HT = 1) or an exception. + // TrapID 0 is reserved for hardware use and should not be used in a + // shader-generated trap. + void + Inst_SOPP__S_TRAP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_SOPP__S_ICACHE_INV class methods --- + + Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt) + : Inst_SOPP(iFmt, "s_icache_inv") + { + setFlag(MemBarrier); + setFlag(GPUStaticInst::MemSync); + setFlag(MemSync); + } // Inst_SOPP__S_ICACHE_INV + + Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV() + { + } // ~Inst_SOPP__S_ICACHE_INV + + // --- description from .arch file --- + // Invalidate entire L1 instruction cache. 
+    // You must have 12 separate S_NOP instructions or a jump/branch
+    // instruction after this instruction
+    // to ensure the SQ instruction buffer is purged.
+    void
+    Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // nothing to do when no lanes are active; undo the issue-count bump
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decLGKMInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        gpuDynInst->resetEntireStatusVector();
+        gpuDynInst->setStatusVector(0, 1);
+        RequestPtr req = std::make_shared<Request>(0, 0, 0,
+                                                   gpuDynInst->computeUnit()->
+                                                   requestorId(), 0,
+                                                   gpuDynInst->wfDynId);
+        gpuDynInst->setRequestFlags(req);
+        gpuDynInst->computeUnit()->scalarMemoryPipe.
+            injectScalarMemFence(gpuDynInst, false, req);
+    } // execute
+    // --- Inst_SOPP__S_INCPERFLEVEL class methods ---
+
+    Inst_SOPP__S_INCPERFLEVEL::Inst_SOPP__S_INCPERFLEVEL(InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_incperflevel")
+    {
+    } // Inst_SOPP__S_INCPERFLEVEL
+
+    Inst_SOPP__S_INCPERFLEVEL::~Inst_SOPP__S_INCPERFLEVEL()
+    {
+    } // ~Inst_SOPP__S_INCPERFLEVEL
+
+    // --- description from .arch file ---
+    // Increment performance counter specified in SIMM16[3:0] by 1.
+    void
+    Inst_SOPP__S_INCPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_SOPP__S_DECPERFLEVEL class methods ---
+
+    Inst_SOPP__S_DECPERFLEVEL::Inst_SOPP__S_DECPERFLEVEL(InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_decperflevel")
+    {
+    } // Inst_SOPP__S_DECPERFLEVEL
+
+    Inst_SOPP__S_DECPERFLEVEL::~Inst_SOPP__S_DECPERFLEVEL()
+    {
+    } // ~Inst_SOPP__S_DECPERFLEVEL
+
+    // --- description from .arch file ---
+    // Decrement performance counter specified in SIMM16[3:0] by 1.
+    void
+    Inst_SOPP__S_DECPERFLEVEL::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // perf counters are not modeled
+    } // execute
+    // --- Inst_SOPP__S_TTRACEDATA class methods ---
+
+    Inst_SOPP__S_TTRACEDATA::Inst_SOPP__S_TTRACEDATA(InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_ttracedata")
+    {
+    } // Inst_SOPP__S_TTRACEDATA
+
+    Inst_SOPP__S_TTRACEDATA::~Inst_SOPP__S_TTRACEDATA()
+    {
+    } // ~Inst_SOPP__S_TTRACEDATA
+
+    // --- description from .arch file ---
+    // Send M0 as user data to the thread trace stream.
+    void
+    Inst_SOPP__S_TTRACEDATA::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // thread tracing is not modeled
+    } // execute
+    // --- Inst_SOPP__S_CBRANCH_CDBGSYS class methods ---
+
+    Inst_SOPP__S_CBRANCH_CDBGSYS::Inst_SOPP__S_CBRANCH_CDBGSYS(
+        InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_cbranch_cdbgsys")
+    {
+        setFlag(Branch);
+    } // Inst_SOPP__S_CBRANCH_CDBGSYS
+
+    Inst_SOPP__S_CBRANCH_CDBGSYS::~Inst_SOPP__S_CBRANCH_CDBGSYS()
+    {
+    } // ~Inst_SOPP__S_CBRANCH_CDBGSYS
+
+    // --- description from .arch file ---
+    // if (conditional_debug_system != 0) then PC = PC + signext(SIMM16 * 4)
+    // + 4;
+    // else NOP.
+    void
+    Inst_SOPP__S_CBRANCH_CDBGSYS::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // conditional debug state is not modeled
+    } // execute
+    // --- Inst_SOPP__S_CBRANCH_CDBGUSER class methods ---
+
+    Inst_SOPP__S_CBRANCH_CDBGUSER::Inst_SOPP__S_CBRANCH_CDBGUSER(
+        InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_cbranch_cdbguser")
+    {
+        setFlag(Branch);
+    } // Inst_SOPP__S_CBRANCH_CDBGUSER
+
+    Inst_SOPP__S_CBRANCH_CDBGUSER::~Inst_SOPP__S_CBRANCH_CDBGUSER()
+    {
+    } // ~Inst_SOPP__S_CBRANCH_CDBGUSER
+
+    // --- description from .arch file ---
+    // if (conditional_debug_user != 0) then PC = PC + signext(SIMM16 * 4) + 4;
+    // else NOP.
+    void
+    Inst_SOPP__S_CBRANCH_CDBGUSER::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // conditional debug state is not modeled
+    } // execute
+    // --- Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER class methods ---
+
+    Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER(
+        InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_or_user")
+    {
+        setFlag(Branch);
+    } // Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER
+
+    Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::
+    ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER()
+    {
+    } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER
+
+    // --- description from .arch file ---
+    // if (conditional_debug_system || conditional_debug_user) then PC = PC +
+    // signext(SIMM16 * 4) + 4;
+    // else NOP.
+    void
+    Inst_SOPP__S_CBRANCH_CDBGSYS_OR_USER::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // conditional debug state is not modeled
+    } // execute
+    // --- Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER class methods ---
+
+    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
+    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER(InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_cbranch_cdbgsys_and_user")
+    {
+        setFlag(Branch);
+    } // Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER
+
+    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::
+    ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER()
+    {
+    } // ~Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER
+
+    // --- description from .arch file ---
+    // if (conditional_debug_system && conditional_debug_user) then PC = PC +
+    // signext(SIMM16 * 4) + 4;
+    // else NOP.
+    void
+    Inst_SOPP__S_CBRANCH_CDBGSYS_AND_USER::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // conditional debug state is not modeled
+    } // execute
+    // --- Inst_SOPP__S_ENDPGM_SAVED class methods ---
+
+    Inst_SOPP__S_ENDPGM_SAVED::Inst_SOPP__S_ENDPGM_SAVED(InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_endpgm_saved")
+    {
+    } // Inst_SOPP__S_ENDPGM_SAVED
+
+    Inst_SOPP__S_ENDPGM_SAVED::~Inst_SOPP__S_ENDPGM_SAVED()
+    {
+    } // ~Inst_SOPP__S_ENDPGM_SAVED
+
+    // --- description from .arch file ---
+    // End of program; signal that a wave has been saved by the context-switch
+    // trap handler and terminate wavefront.
+    // The hardware implicitly executes S_WAITCNT 0 before executing this
+    // instruction.
+    // Use S_ENDPGM in all cases unless you are executing the context-switch
+    // save handler.
+    void
+    Inst_SOPP__S_ENDPGM_SAVED::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // context-switch save/restore is not modeled
+    } // execute
+    // --- Inst_SOPP__S_SET_GPR_IDX_OFF class methods ---
+
+    Inst_SOPP__S_SET_GPR_IDX_OFF::Inst_SOPP__S_SET_GPR_IDX_OFF(
+        InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_set_gpr_idx_off")
+    {
+    } // Inst_SOPP__S_SET_GPR_IDX_OFF
+
+    Inst_SOPP__S_SET_GPR_IDX_OFF::~Inst_SOPP__S_SET_GPR_IDX_OFF()
+    {
+    } // ~Inst_SOPP__S_SET_GPR_IDX_OFF
+
+    // --- description from .arch file ---
+    // MODE.gpr_idx_en = 0.
+    // Clear GPR indexing mode. Vector operations after this will not perform
+    // relative GPR addressing regardless of the contents of M0. This
+    // instruction does not modify M0.
+    void
+    Inst_SOPP__S_SET_GPR_IDX_OFF::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // GPR indexing mode is not modeled
+    } // execute
+    // --- Inst_SOPP__S_SET_GPR_IDX_MODE class methods ---
+
+    Inst_SOPP__S_SET_GPR_IDX_MODE::Inst_SOPP__S_SET_GPR_IDX_MODE(
+        InFmt_SOPP *iFmt)
+        : Inst_SOPP(iFmt, "s_set_gpr_idx_mode")
+    {
+    } // Inst_SOPP__S_SET_GPR_IDX_MODE
+
+    Inst_SOPP__S_SET_GPR_IDX_MODE::~Inst_SOPP__S_SET_GPR_IDX_MODE()
+    {
+    } // ~Inst_SOPP__S_SET_GPR_IDX_MODE
+
+    // --- description from .arch file ---
+    // M0[15:12] = SIMM4.
+    // Modify the mode used for vector GPR indexing.
+    // The raw contents of the source field are read and used to set the enable
+    // bits. SIMM4[0] = VSRC0_REL, SIMM4[1] = VSRC1_REL, SIMM4[2] = VSRC2_REL
+    // and SIMM4[3] = VDST_REL.
+    void
+    Inst_SOPP__S_SET_GPR_IDX_MODE::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // GPR indexing mode is not modeled
+    } // execute
+} // namespace VegaISA
+} // namespace gem5
diff --git a/src/arch/amdgpu/vega/insts/vinterp.cc b/src/arch/amdgpu/vega/insts/vinterp.cc
new file mode 100644
index 0000000000..784f6f2eb2
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/vinterp.cc
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/amdgpu/vega/insts/instructions.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+    // --- Inst_VINTRP__V_INTERP_P1_F32 class methods ---
+
+    Inst_VINTRP__V_INTERP_P1_F32::Inst_VINTRP__V_INTERP_P1_F32(
+        InFmt_VINTRP *iFmt)
+        : Inst_VINTRP(iFmt, "v_interp_p1_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VINTRP__V_INTERP_P1_F32
+
+    Inst_VINTRP__V_INTERP_P1_F32::~Inst_VINTRP__V_INTERP_P1_F32()
+    {
+    } // ~Inst_VINTRP__V_INTERP_P1_F32
+
+    // --- description from .arch file ---
+    // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to
+    // V_MAD_F32 for SP).
+    // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S;
+    // if D == S then data corruption will occur.
+    // NOTE: In textual representations the I/J VGPR is the first source and
+    // the attribute is the second source; however in the VOP3 encoding the
+    // attribute is stored in the src0 field and the VGPR is stored in the
+    // src1 field.
+    void
+    Inst_VINTRP__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // graphics interpolation is not modeled
+    } // execute
+    // --- Inst_VINTRP__V_INTERP_P2_F32 class methods ---
+
+    Inst_VINTRP__V_INTERP_P2_F32::Inst_VINTRP__V_INTERP_P2_F32(
+        InFmt_VINTRP *iFmt)
+        : Inst_VINTRP(iFmt, "v_interp_p2_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VINTRP__V_INTERP_P2_F32
+
+    Inst_VINTRP__V_INTERP_P2_F32::~Inst_VINTRP__V_INTERP_P2_F32()
+    {
+    } // ~Inst_VINTRP__V_INTERP_P2_F32
+
+    // --- description from .arch file ---
+    // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to
+    // V_MAD_F32 for SP).
+    // NOTE: In textual representations the I/J VGPR is the first source and
+    // the attribute is the second source; however in the VOP3 encoding the
+    // attribute is stored in the src0 field and the VGPR is stored in the
+    // src1 field.
+    void
+    Inst_VINTRP__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // graphics interpolation is not modeled
+    } // execute
+    // --- Inst_VINTRP__V_INTERP_MOV_F32 class methods ---
+
+    Inst_VINTRP__V_INTERP_MOV_F32::Inst_VINTRP__V_INTERP_MOV_F32(
+        InFmt_VINTRP *iFmt)
+        : Inst_VINTRP(iFmt, "v_interp_mov_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VINTRP__V_INTERP_MOV_F32
+
+    Inst_VINTRP__V_INTERP_MOV_F32::~Inst_VINTRP__V_INTERP_MOV_F32()
+    {
+    } // ~Inst_VINTRP__V_INTERP_MOV_F32
+
+    // --- description from .arch file ---
+    // D.f = {P10,P20,P0}[S.u]; parameter load.
+    void
+    Inst_VINTRP__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // graphics interpolation is not modeled
+    } // execute
+} // namespace VegaISA
+} // namespace gem5
diff --git a/src/arch/amdgpu/vega/insts/vop1.cc b/src/arch/amdgpu/vega/insts/vop1.cc
new file mode 100644
index 0000000000..f970923951
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/vop1.cc
@@ -0,0 +1,2435 @@
+/*
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/amdgpu/vega/insts/inst_util.hh"
+#include "arch/amdgpu/vega/insts/instructions.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+    // --- Inst_VOP1__V_NOP class methods ---
+
+    Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_nop")
+    {
+        setFlag(Nop);
+        setFlag(ALU);
+    } // Inst_VOP1__V_NOP
+
+    Inst_VOP1__V_NOP::~Inst_VOP1__V_NOP()
+    {
+    } // ~Inst_VOP1__V_NOP
+
+    // --- description from .arch file ---
+    // Do nothing.
+    void
+    Inst_VOP1__V_NOP::execute(GPUDynInstPtr gpuDynInst)
+    {
+    } // execute
+    // --- Inst_VOP1__V_MOV_B32 class methods ---
+
+    Inst_VOP1__V_MOV_B32::Inst_VOP1__V_MOV_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_mov_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_MOV_B32
+
+    Inst_VOP1__V_MOV_B32::~Inst_VOP1__V_MOV_B32()
+    {
+    } // ~Inst_VOP1__V_MOV_B32
+
+    // --- description from .arch file ---
+    // D.u = S0.u.
+    // Input and output modifiers not supported; this is an untyped operation.
+    void
+    Inst_VOP1__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        if (isDPPInst()) {
+            VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
+            src_dpp.read();
+
+            DPRINTF(VEGA, "Handling V_MOV_B32 SRC DPP. SRC0: register v[%d], "
+                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
+                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
+                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
+                    extData.iFmt_VOP_DPP.DPP_CTRL,
+                    extData.iFmt_VOP_DPP.SRC0_ABS,
+                    extData.iFmt_VOP_DPP.SRC0_NEG,
+                    extData.iFmt_VOP_DPP.SRC1_ABS,
+                    extData.iFmt_VOP_DPP.SRC1_NEG,
+                    extData.iFmt_VOP_DPP.BC,
+                    extData.iFmt_VOP_DPP.BANK_MASK,
+                    extData.iFmt_VOP_DPP.ROW_MASK);
+
+            // NOTE: For VOP1, there is no SRC1, so make sure we're not trying
+            // to negate it or take the absolute value of it
+            assert(!extData.iFmt_VOP_DPP.SRC1_ABS);
+            assert(!extData.iFmt_VOP_DPP.SRC1_NEG);
+            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp);
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src_dpp[lane];
+                }
+            }
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_READFIRSTLANE_B32 class methods ---
+
+    Inst_VOP1__V_READFIRSTLANE_B32::Inst_VOP1__V_READFIRSTLANE_B32(
+        InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_readfirstlane_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_READFIRSTLANE_B32
+
+    Inst_VOP1__V_READFIRSTLANE_B32::~Inst_VOP1__V_READFIRSTLANE_B32()
+    {
+    } // ~Inst_VOP1__V_READFIRSTLANE_B32
+
+    // --- description from .arch file ---
+    // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data
+    // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec)
+    // (Lane# = 0 if exec is zero). Ignores exec mask for the access. SQ
+    // translates to V_READLANE_B32.
+    // Input and output modifiers not supported; this is an untyped operation.
+    void
+    Inst_VOP1__V_READFIRSTLANE_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarRegI32 src_lane(0);
+        ScalarRegU64 exec_mask = wf->execMask().to_ullong();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        ScalarOperandU32 sdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        if (exec_mask) {
+            src_lane = findLsbSet(exec_mask); // first active lane
+        }
+
+        sdst = src[src_lane];
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_I32_F64 class methods ---
+
+    Inst_VOP1__V_CVT_I32_F64::Inst_VOP1__V_CVT_I32_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_i32_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_CVT_I32_F64
+
+    Inst_VOP1__V_CVT_I32_F64::~Inst_VOP1__V_CVT_I32_F64()
+    {
+    } // ~Inst_VOP1__V_CVT_I32_F64
+
+    // --- description from .arch file ---
+    // D.i = (int)S0.d.
+    // Out-of-range floating point values (including infinity) saturate. NaN is
+    // converted to 0.
+    void
+    Inst_VOP1__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                int exp;
+                std::frexp(src[lane],&exp); // |src| = m * 2^exp, 0.5 <= m < 1
+                if (std::isnan(src[lane])) {
+                    vdst[lane] = 0;
+                } else if (std::isinf(src[lane]) || exp > 30) {
+                    if (std::signbit(src[lane])) {
+                        vdst[lane] = INT_MIN; // saturate negative overflow
+                    } else {
+                        vdst[lane] = INT_MAX; // saturate positive overflow
+                    }
+                } else {
+                    vdst[lane] = (VecElemI32)src[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F64_I32 class methods ---
+
+    Inst_VOP1__V_CVT_F64_I32::Inst_VOP1__V_CVT_F64_I32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f64_i32")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_CVT_F64_I32
+
+    Inst_VOP1__V_CVT_F64_I32::~Inst_VOP1__V_CVT_F64_I32()
+    {
+    } // ~Inst_VOP1__V_CVT_F64_I32
+
+    // --- description from .arch file ---
+    // D.d = (double)S0.i.
+    void
+    Inst_VOP1__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF64)src[lane]; // exact: all i32 fit in f64
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F32_I32 class methods ---
+
+    Inst_VOP1__V_CVT_F32_I32::Inst_VOP1__V_CVT_F32_I32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f32_i32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_F32_I32
+
+    Inst_VOP1__V_CVT_F32_I32::~Inst_VOP1__V_CVT_F32_I32()
+    {
+    } // ~Inst_VOP1__V_CVT_F32_I32
+
+    // --- description from .arch file ---
+    // D.f = (float)S0.i.
+    void
+    Inst_VOP1__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF32)src[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F32_U32 class methods ---
+
+    Inst_VOP1__V_CVT_F32_U32::Inst_VOP1__V_CVT_F32_U32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f32_u32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_F32_U32
+
+    Inst_VOP1__V_CVT_F32_U32::~Inst_VOP1__V_CVT_F32_U32()
+    {
+    } // ~Inst_VOP1__V_CVT_F32_U32
+
+    // --- description from .arch file ---
+    // D.f = (float)S0.u.
+    void
+    Inst_VOP1__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF32)src[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_U32_F32 class methods ---
+
+    Inst_VOP1__V_CVT_U32_F32::Inst_VOP1__V_CVT_U32_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_u32_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_U32_F32
+
+    Inst_VOP1__V_CVT_U32_F32::~Inst_VOP1__V_CVT_U32_F32()
+    {
+    } // ~Inst_VOP1__V_CVT_U32_F32
+
+    // --- description from .arch file ---
+    // D.u = (unsigned)S0.f.
+    // Out-of-range floating point values (including infinity) saturate. NaN is
+    // converted to 0.
+    void
+    Inst_VOP1__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                int exp;
+                std::frexp(src[lane],&exp); // |src| = m * 2^exp, 0.5 <= m < 1
+                if (std::isnan(src[lane])) {
+                    vdst[lane] = 0;
+                } else if (std::isinf(src[lane])) {
+                    if (std::signbit(src[lane])) {
+                        vdst[lane] = 0; // -inf clamps to unsigned minimum
+                    } else {
+                        vdst[lane] = UINT_MAX;
+                    }
+                } else if (exp > 31) {
+                    vdst[lane] = UINT_MAX; // saturate positive overflow
+                } else {
+                    vdst[lane] = (VecElemU32)src[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_I32_F32 class methods ---
+
+    Inst_VOP1__V_CVT_I32_F32::Inst_VOP1__V_CVT_I32_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_i32_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_I32_F32
+
+    Inst_VOP1__V_CVT_I32_F32::~Inst_VOP1__V_CVT_I32_F32()
+    {
+    } // ~Inst_VOP1__V_CVT_I32_F32
+
+    // --- description from .arch file ---
+    // D.i = (int)S0.f.
+    // Out-of-range floating point values (including infinity) saturate. NaN is
+    // converted to 0.
+    void
+    Inst_VOP1__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                int exp;
+                std::frexp(src[lane],&exp); // |src| = m * 2^exp, 0.5 <= m < 1
+                if (std::isnan(src[lane])) {
+                    vdst[lane] = 0;
+                } else if (std::isinf(src[lane]) || exp > 30) {
+                    if (std::signbit(src[lane])) {
+                        vdst[lane] = INT_MIN; // saturate negative overflow
+                    } else {
+                        vdst[lane] = INT_MAX; // saturate positive overflow
+                    }
+                } else {
+                    vdst[lane] = (VecElemI32)src[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_MOV_FED_B32 class methods ---
+
+    Inst_VOP1__V_MOV_FED_B32::Inst_VOP1__V_MOV_FED_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_mov_fed_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_MOV_FED_B32
+
+    Inst_VOP1__V_MOV_FED_B32::~Inst_VOP1__V_MOV_FED_B32()
+    {
+    } // ~Inst_VOP1__V_MOV_FED_B32
+
+    // --- description from .arch file ---
+    // D.u = S0.u;
+    // Introduce EDC double error upon write to dest vgpr without causing an
+    // exception.
+    // Input and output modifiers not supported; this is an untyped operation.
+    void
+    Inst_VOP1__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented(); // EDC error injection is not modeled
+    } // execute
+    // --- Inst_VOP1__V_CVT_F16_F32 class methods ---
+
+    Inst_VOP1__V_CVT_F16_F32::Inst_VOP1__V_CVT_F16_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f16_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_F16_F32
+
+    Inst_VOP1__V_CVT_F16_F32::~Inst_VOP1__V_CVT_F16_F32()
+    {
+    } // ~Inst_VOP1__V_CVT_F16_F32
+
+    // --- description from .arch file ---
+    // D.f16 = flt32_to_flt16(S0.f).
+    // Supports input modifiers and creates FP16 denormals when appropriate.
+    void
+    Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                float tmp = src[lane];
+                AMDGPU::mxfloat16 out(tmp);
+                // keep the f16 bit pattern in the low 16 bits of the VGPR
+                vdst[lane] = (out.data >> 16);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F32_F16 class methods ---
+
+    Inst_VOP1__V_CVT_F32_F16::Inst_VOP1__V_CVT_F32_F16(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f32_f16")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_F32_F16
+
+    Inst_VOP1__V_CVT_F32_F16::~Inst_VOP1__V_CVT_F32_F16()
+    {
+    } // ~Inst_VOP1__V_CVT_F32_F16
+
+    // --- description from .arch file ---
+    // D.f = flt16_to_flt32(S0.f16).
+    // FP16 denormal inputs are always accepted.
+    void
+    Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                AMDGPU::mxfloat16 tmp(src[lane]);
+                vdst[lane] = float(tmp);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods ---
+
+    Inst_VOP1__V_CVT_RPI_I32_F32::Inst_VOP1__V_CVT_RPI_I32_F32(
+        InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_RPI_I32_F32
+
+    Inst_VOP1__V_CVT_RPI_I32_F32::~Inst_VOP1__V_CVT_RPI_I32_F32()
+    {
+    } // ~Inst_VOP1__V_CVT_RPI_I32_F32
+
+    // --- description from .arch file ---
+    // D.i = (int)floor(S0.f + 0.5).
+    void
+    Inst_VOP1__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_FLR_I32_F32 class methods ---
+
+    Inst_VOP1__V_CVT_FLR_I32_F32::Inst_VOP1__V_CVT_FLR_I32_F32(
+        InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_FLR_I32_F32
+
+    Inst_VOP1__V_CVT_FLR_I32_F32::~Inst_VOP1__V_CVT_FLR_I32_F32()
+    {
+    } // ~Inst_VOP1__V_CVT_FLR_I32_F32
+
+    // --- description from .arch file ---
+    // D.i = (int)floor(S0.f).
+    void
+    Inst_VOP1__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemI32)std::floor(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_OFF_F32_I4 class methods ---
+
+    Inst_VOP1__V_CVT_OFF_F32_I4::Inst_VOP1__V_CVT_OFF_F32_I4(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_off_f32_i4")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_OFF_F32_I4
+
+    Inst_VOP1__V_CVT_OFF_F32_I4::~Inst_VOP1__V_CVT_OFF_F32_I4()
+    {
+    } // ~Inst_VOP1__V_CVT_OFF_F32_I4
+
+    // --- description from .arch file ---
+    // 4-bit signed int to 32-bit float. Used for interpolation in shader.
+    void
+    Inst_VOP1__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // Could not parse sq_uc.arch desc field
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F32_F64 class methods ---
+
+    Inst_VOP1__V_CVT_F32_F64::Inst_VOP1__V_CVT_F32_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f32_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_CVT_F32_F64
+
+    Inst_VOP1__V_CVT_F32_F64::~Inst_VOP1__V_CVT_F32_F64()
+    {
+    } // ~Inst_VOP1__V_CVT_F32_F64
+
+    // --- description from .arch file ---
+    // D.f = (float)S0.d.
+    void
+    Inst_VOP1__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF32)src[lane]; // narrowing f64 -> f32
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F64_F32 class methods ---
+
+    Inst_VOP1__V_CVT_F64_F32::Inst_VOP1__V_CVT_F64_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f64_f32")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_CVT_F64_F32
+
+    Inst_VOP1__V_CVT_F64_F32::~Inst_VOP1__V_CVT_F64_F32()
+    {
+    } // ~Inst_VOP1__V_CVT_F64_F32
+
+    // --- description from .arch file ---
+    // D.d = (double)S0.f.
+    void
+    Inst_VOP1__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF64)src[lane]; // widening f32 -> f64, exact
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F32_UBYTE0 class methods ---
+
+    Inst_VOP1__V_CVT_F32_UBYTE0::Inst_VOP1__V_CVT_F32_UBYTE0(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_F32_UBYTE0
+
+    Inst_VOP1__V_CVT_F32_UBYTE0::~Inst_VOP1__V_CVT_F32_UBYTE0()
+    {
+    } // ~Inst_VOP1__V_CVT_F32_UBYTE0
+
+    // --- description from .arch file ---
+    // D.f = (float)(S0.u[7:0]).
+    void
+    Inst_VOP1__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0)); // byte 0
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F32_UBYTE1 class methods ---
+
+    Inst_VOP1__V_CVT_F32_UBYTE1::Inst_VOP1__V_CVT_F32_UBYTE1(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_F32_UBYTE1
+
+    Inst_VOP1__V_CVT_F32_UBYTE1::~Inst_VOP1__V_CVT_F32_UBYTE1()
+    {
+    } // ~Inst_VOP1__V_CVT_F32_UBYTE1
+
+    // --- description from .arch file ---
+    // D.f = (float)(S0.u[15:8]).
+    void
+    Inst_VOP1__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8)); // byte 1
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F32_UBYTE2 class methods ---
+
+    Inst_VOP1__V_CVT_F32_UBYTE2::Inst_VOP1__V_CVT_F32_UBYTE2(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_F32_UBYTE2
+
+    Inst_VOP1__V_CVT_F32_UBYTE2::~Inst_VOP1__V_CVT_F32_UBYTE2()
+    {
+    } // ~Inst_VOP1__V_CVT_F32_UBYTE2
+
+    // --- description from .arch file ---
+    // D.f = (float)(S0.u[23:16]).
+    void
+    Inst_VOP1__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16)); // byte 2
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F32_UBYTE3 class methods ---
+
+    Inst_VOP1__V_CVT_F32_UBYTE3::Inst_VOP1__V_CVT_F32_UBYTE3(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CVT_F32_UBYTE3
+
+    Inst_VOP1__V_CVT_F32_UBYTE3::~Inst_VOP1__V_CVT_F32_UBYTE3()
+    {
+    } // ~Inst_VOP1__V_CVT_F32_UBYTE3
+
+    // --- description from .arch file ---
+    // D.f = (float)(S0.u[31:24]).
+    void
+    Inst_VOP1__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Extract byte 3 (bits 31:24) and convert it to f32.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24));
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_U32_F64 class methods ---
+
+    Inst_VOP1__V_CVT_U32_F64::Inst_VOP1__V_CVT_U32_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_u32_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_CVT_U32_F64
+
+    Inst_VOP1__V_CVT_U32_F64::~Inst_VOP1__V_CVT_U32_F64()
+    {
+    } // ~Inst_VOP1__V_CVT_U32_F64
+
+    // --- description from .arch file ---
+    // D.u = (unsigned)S0.d.
+    // Out-of-range floating point values (including infinity) saturate. NaN is
+    // --- converted to 0.
+    void
+    Inst_VOP1__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::isnan(src[lane])) {
+                    // NaN converts to 0 per the description above.
+                    vdst[lane] = 0;
+                } else if (std::isinf(src[lane])) {
+                    // Infinities saturate toward the corresponding bound.
+                    vdst[lane] = std::signbit(src[lane]) ? 0 : UINT_MAX;
+                } else if (std::signbit(src[lane])) {
+                    // Negative finite values are below the u32 range and
+                    // saturate to 0. (Casting a negative double to unsigned
+                    // is undefined behavior, so do not fall through to the
+                    // cast.)
+                    vdst[lane] = 0;
+                } else {
+                    // frexp yields src = m * 2^exp with m in [0.5, 1), so
+                    // any value < 2^32 (i.e., exp <= 32) fits in a u32;
+                    // exp > 32 means src >= 2^32 and must saturate.
+                    int exp = 0;
+                    std::frexp(src[lane], &exp);
+                    vdst[lane] = (exp > 32) ? UINT_MAX
+                                            : (VecElemU32)src[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F64_U32 class methods ---
+
+    Inst_VOP1__V_CVT_F64_U32::Inst_VOP1__V_CVT_F64_U32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f64_u32")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_CVT_F64_U32
+
+    Inst_VOP1__V_CVT_F64_U32::~Inst_VOP1__V_CVT_F64_U32()
+    {
+    } // ~Inst_VOP1__V_CVT_F64_U32
+
+    // --- description from .arch file ---
+    // D.d = (double)S0.u.
+    void
+    Inst_VOP1__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Every u32 value is exactly representable in f64; plain cast.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = (VecElemF64)src[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_TRUNC_F64 class methods ---
+
+    Inst_VOP1__V_TRUNC_F64::Inst_VOP1__V_TRUNC_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_trunc_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_TRUNC_F64
+
+    Inst_VOP1__V_TRUNC_F64::~Inst_VOP1__V_TRUNC_F64()
+    {
+    } // ~Inst_VOP1__V_TRUNC_F64
+
+    // --- description from .arch file ---
+    // D.d = trunc(S0.d), return integer part of S0.d.
+    void
+    Inst_VOP1__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Round toward zero (drop the fractional part).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::trunc(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CEIL_F64 class methods ---
+
+    Inst_VOP1__V_CEIL_F64::Inst_VOP1__V_CEIL_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_ceil_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_CEIL_F64
+
+    Inst_VOP1__V_CEIL_F64::~Inst_VOP1__V_CEIL_F64()
+    {
+    } // ~Inst_VOP1__V_CEIL_F64
+
+    // --- description from .arch file ---
+    // D.d = trunc(S0.d);
+    // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0.
+    void
+    Inst_VOP1__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // std::ceil implements the trunc-then-bump-positive recipe above.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::ceil(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_RNDNE_F64 class methods ---
+
+    Inst_VOP1__V_RNDNE_F64::Inst_VOP1__V_RNDNE_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_rndne_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_RNDNE_F64
+
+    Inst_VOP1__V_RNDNE_F64::~Inst_VOP1__V_RNDNE_F64()
+    {
+    } // ~Inst_VOP1__V_RNDNE_F64
+
+    // --- description from .arch file ---
+    // D.d = round_nearest_even(S0.d).
+    void
+    Inst_VOP1__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // roundNearestEven is a shared ISA helper (ties go to even).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = roundNearestEven(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FLOOR_F64 class methods ---
+
+    Inst_VOP1__V_FLOOR_F64::Inst_VOP1__V_FLOOR_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_floor_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_FLOOR_F64
+
+    Inst_VOP1__V_FLOOR_F64::~Inst_VOP1__V_FLOOR_F64()
+    {
+    } // ~Inst_VOP1__V_FLOOR_F64
+
+    // --- description from .arch file ---
+    // D.d = trunc(S0.d);
+    // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0.
+    void
+    Inst_VOP1__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // std::floor implements the trunc-then-bump-negative recipe above.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::floor(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FRACT_F32 class methods ---
+
+    Inst_VOP1__V_FRACT_F32::Inst_VOP1__V_FRACT_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_fract_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_FRACT_F32
+
+    Inst_VOP1__V_FRACT_F32::~Inst_VOP1__V_FRACT_F32()
+    {
+    } // ~Inst_VOP1__V_FRACT_F32
+
+    // --- description from .arch file ---
+    // D.f = S0.f - floor(S0.f).
+    void
+    Inst_VOP1__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Compute S0 - floor(S0) directly. Note std::modf would be wrong
+        // here: its fractional part carries the sign of the input, so for
+        // example fract(-1.25) must be 0.75 (per the description above)
+        // but modf would yield -0.25.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src[lane] - std::floor(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_TRUNC_F32 class methods ---
+
+    Inst_VOP1__V_TRUNC_F32::Inst_VOP1__V_TRUNC_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_trunc_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_TRUNC_F32
+
+    Inst_VOP1__V_TRUNC_F32::~Inst_VOP1__V_TRUNC_F32()
+    {
+    } // ~Inst_VOP1__V_TRUNC_F32
+
+    // --- description from .arch file ---
+    // D.f = trunc(S0.f), return integer part of S0.f.
+    void
+    Inst_VOP1__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        // Note: removed the stray space before the parenthesis to match
+        // the declaration style used everywhere else in this file.
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Round toward zero (drop the fractional part).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::trunc(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CEIL_F32 class methods ---
+
+    Inst_VOP1__V_CEIL_F32::Inst_VOP1__V_CEIL_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_ceil_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_CEIL_F32
+
+    Inst_VOP1__V_CEIL_F32::~Inst_VOP1__V_CEIL_F32()
+    {
+    } // ~Inst_VOP1__V_CEIL_F32
+
+    // --- description from .arch file ---
+    // D.f = trunc(S0.f);
+    // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0.
+    void
+    Inst_VOP1__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // std::ceil implements the trunc-then-bump-positive recipe above.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::ceil(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_RNDNE_F32 class methods ---
+
+    Inst_VOP1__V_RNDNE_F32::Inst_VOP1__V_RNDNE_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_rndne_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_RNDNE_F32
+
+    Inst_VOP1__V_RNDNE_F32::~Inst_VOP1__V_RNDNE_F32()
+    {
+    } // ~Inst_VOP1__V_RNDNE_F32
+
+    // --- description from .arch file ---
+    // D.f = round_nearest_even(S0.f).
+    void
+    Inst_VOP1__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // roundNearestEven is a shared ISA helper (ties go to even).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = roundNearestEven(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FLOOR_F32 class methods ---
+
+    Inst_VOP1__V_FLOOR_F32::Inst_VOP1__V_FLOOR_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_floor_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_FLOOR_F32
+
+    Inst_VOP1__V_FLOOR_F32::~Inst_VOP1__V_FLOOR_F32()
+    {
+    } // ~Inst_VOP1__V_FLOOR_F32
+
+    // --- description from .arch file ---
+    // D.f = trunc(S0.f);
+    // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0.
+    void
+    Inst_VOP1__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // std::floor implements the trunc-then-bump-negative recipe above.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::floor(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_EXP_F32 class methods ---
+
+    Inst_VOP1__V_EXP_F32::Inst_VOP1__V_EXP_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_exp_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_EXP_F32
+
+    Inst_VOP1__V_EXP_F32::~Inst_VOP1__V_EXP_F32()
+    {
+    } // ~Inst_VOP1__V_EXP_F32
+
+    // --- description from .arch file ---
+    // D.f = pow(2.0, S0.f).
+    void
+    Inst_VOP1__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Base-2 exponential; computed in double then narrowed to f32 on
+        // assignment.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::pow(2.0, src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_LOG_F32 class methods ---
+
+    Inst_VOP1__V_LOG_F32::Inst_VOP1__V_LOG_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_log_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_LOG_F32
+
+    Inst_VOP1__V_LOG_F32::~Inst_VOP1__V_LOG_F32()
+    {
+    } // ~Inst_VOP1__V_LOG_F32
+
+    // --- description from .arch file ---
+    // D.f = log2(S0.f). Base 2 logarithm.
+    void
+    Inst_VOP1__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::log2(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_RCP_F32 class methods ---
+
+    Inst_VOP1__V_RCP_F32::Inst_VOP1__V_RCP_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_rcp_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_RCP_F32
+
+    Inst_VOP1__V_RCP_F32::~Inst_VOP1__V_RCP_F32()
+    {
+    } // ~Inst_VOP1__V_RCP_F32
+
+    // --- description from .arch file ---
+    // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error.
+    void
+    Inst_VOP1__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // 1.0 / x is evaluated in double and narrowed to f32 on
+        // assignment; IEEE rules give +/-inf for +/-0 inputs.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = 1.0 / src[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_RCP_IFLAG_F32 class methods ---
+
+    Inst_VOP1__V_RCP_IFLAG_F32::Inst_VOP1__V_RCP_IFLAG_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_rcp_iflag_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_RCP_IFLAG_F32
+
+    Inst_VOP1__V_RCP_IFLAG_F32::~Inst_VOP1__V_RCP_IFLAG_F32()
+    {
+    } // ~Inst_VOP1__V_RCP_IFLAG_F32
+
+    // --- description from .arch file ---
+    // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise
+    // --- integer DIV_BY_ZERO exception but cannot raise floating-point
+    // --- exceptions.
+    void
+    Inst_VOP1__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Same datapath as V_RCP_F32 here; the integer DIV_BY_ZERO
+        // exception described above is not modeled.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = 1.0 / src[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_RSQ_F32 class methods ---
+
+    Inst_VOP1__V_RSQ_F32::Inst_VOP1__V_RSQ_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_rsq_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_RSQ_F32
+
+    Inst_VOP1__V_RSQ_F32::~Inst_VOP1__V_RSQ_F32()
+    {
+    } // ~Inst_VOP1__V_RSQ_F32
+
+    // --- description from .arch file ---
+    // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules.
+    void
+    Inst_VOP1__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = 1.0 / std::sqrt(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_RCP_F64 class methods ---
+
+    Inst_VOP1__V_RCP_F64::Inst_VOP1__V_RCP_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_rcp_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_RCP_F64
+
+    Inst_VOP1__V_RCP_F64::~Inst_VOP1__V_RCP_F64()
+    {
+    } // ~Inst_VOP1__V_RCP_F64
+
+    // --- description from .arch file ---
+    // D.d = 1.0 / S0.d.
+    void
+    Inst_VOP1__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Special cases handled explicitly:
+        //   +/-0   -> +inf (note: sign of zero is not preserved here)
+        //   NaN    -> NaN
+        //   +/-inf -> +/-0
+        //   else   -> 1.0 / x
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::fpclassify(src[lane]) == FP_ZERO) {
+                    vdst[lane] = +INFINITY;
+                } else if (std::isnan(src[lane])) {
+                    vdst[lane] = NAN;
+                } else if (std::isinf(src[lane])) {
+                    if (std::signbit(src[lane])) {
+                        vdst[lane] = -0.0;
+                    } else {
+                        vdst[lane] = 0.0;
+                    }
+                } else {
+                    vdst[lane] = 1.0 / src[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_RSQ_F64 class methods ---
+
+    Inst_VOP1__V_RSQ_F64::Inst_VOP1__V_RSQ_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_rsq_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_RSQ_F64
+
+    Inst_VOP1__V_RSQ_F64::~Inst_VOP1__V_RSQ_F64()
+    {
+    } // ~Inst_VOP1__V_RSQ_F64
+
+    // --- description from .arch file ---
+    // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32.
+    void
+    Inst_VOP1__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Special cases handled explicitly:
+        //   +/-0          -> +inf
+        //   NaN           -> NaN
+        //   +inf          -> 0
+        //   negative      -> NaN (sqrt of a negative is undefined)
+        //   else          -> 1 / sqrt(x)
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::fpclassify(src[lane]) == FP_ZERO) {
+                    vdst[lane] = +INFINITY;
+                } else if (std::isnan(src[lane])) {
+                    vdst[lane] = NAN;
+                } else if (std::isinf(src[lane])
+                           && !std::signbit(src[lane])) {
+                    vdst[lane] = 0.0;
+                } else if (std::signbit(src[lane])) {
+                    vdst[lane] = NAN;
+                } else {
+                    vdst[lane] = 1.0 / std::sqrt(src[lane]);
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_SQRT_F32 class methods ---
+
+    Inst_VOP1__V_SQRT_F32::Inst_VOP1__V_SQRT_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_sqrt_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_SQRT_F32
+
+    Inst_VOP1__V_SQRT_F32::~Inst_VOP1__V_SQRT_F32()
+    {
+    } // ~Inst_VOP1__V_SQRT_F32
+
+    // --- description from .arch file ---
+    // D.f = sqrt(S0.f).
+    void
+    Inst_VOP1__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::sqrt(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_SQRT_F64 class methods ---
+
+    Inst_VOP1__V_SQRT_F64::Inst_VOP1__V_SQRT_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_sqrt_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_SQRT_F64
+
+    Inst_VOP1__V_SQRT_F64::~Inst_VOP1__V_SQRT_F64()
+    {
+    } // ~Inst_VOP1__V_SQRT_F64
+
+    // --- description from .arch file ---
+    // D.d = sqrt(S0.d).
+    void
+    Inst_VOP1__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::sqrt(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_SIN_F32 class methods ---
+
+    Inst_VOP1__V_SIN_F32::Inst_VOP1__V_SIN_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_sin_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_SIN_F32
+
+    Inst_VOP1__V_SIN_F32::~Inst_VOP1__V_SIN_F32()
+    {
+    } // ~Inst_VOP1__V_SIN_F32
+
+    // --- description from .arch file ---
+    // D.f = sin(S0.f * 2 * PI).
+    // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
+    // float 0.0.
+    void
+    Inst_VOP1__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        // REG_PI is a hardware-provided inline constant register.
+        ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+        pi.read();
+
+        // The input is in revolutions (scaled by 2*PI); out-of-range
+        // inputs produce 0.0 per the description above.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (src[lane] < -256.0 || src[lane] > 256.0) {
+                    vdst[lane] = 0.0;
+                } else {
+                    vdst[lane] = std::sin(src[lane] * 2.0 * pi.rawData());
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_COS_F32 class methods ---
+
+    Inst_VOP1__V_COS_F32::Inst_VOP1__V_COS_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cos_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_COS_F32
+
+    Inst_VOP1__V_COS_F32::~Inst_VOP1__V_COS_F32()
+    {
+    } // ~Inst_VOP1__V_COS_F32
+
+    // --- description from .arch file ---
+    // D.f = cos(S0.f * 2 * PI).
+    // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
+    // float 1.0.
+    void
+    Inst_VOP1__V_COS_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        // REG_PI is a hardware-provided inline constant register.
+        ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+        pi.read();
+
+        // The input is in revolutions (scaled by 2*PI). Per the ISA
+        // description above, out-of-range inputs produce 1.0 for cosine
+        // (unlike V_SIN_F32, which produces 0.0).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (src[lane] < -256.0 || src[lane] > 256.0) {
+                    vdst[lane] = 1.0;
+                } else {
+                    vdst[lane] = std::cos(src[lane] * 2.0 * pi.rawData());
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_NOT_B32 class methods ---
+
+    Inst_VOP1__V_NOT_B32::Inst_VOP1__V_NOT_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_not_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_NOT_B32
+
+    Inst_VOP1__V_NOT_B32::~Inst_VOP1__V_NOT_B32()
+    {
+    } // ~Inst_VOP1__V_NOT_B32
+
+    // --- description from .arch file ---
+    // D.u = ~S0.u.
+    // Input and output modifiers not supported.
+    void
+    Inst_VOP1__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Bitwise complement per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = ~src[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_BFREV_B32 class methods ---
+
+    Inst_VOP1__V_BFREV_B32::Inst_VOP1__V_BFREV_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_bfrev_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_BFREV_B32
+
+    Inst_VOP1__V_BFREV_B32::~Inst_VOP1__V_BFREV_B32()
+    {
+    } // ~Inst_VOP1__V_BFREV_B32
+
+    // --- description from .arch file ---
+    // D.u[31:0] = S0.u[0:31], bitfield reverse.
+    // Input and output modifiers not supported.
+    void
+    Inst_VOP1__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // reverseBits is a shared ISA helper (mirrors all 32 bits).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = reverseBits(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FFBH_U32 class methods ---
+
+    Inst_VOP1__V_FFBH_U32::Inst_VOP1__V_FFBH_U32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_ffbh_u32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_FFBH_U32
+
+    Inst_VOP1__V_FFBH_U32::~Inst_VOP1__V_FFBH_U32()
+    {
+    } // ~Inst_VOP1__V_FFBH_U32
+
+    // --- description from .arch file ---
+    // D.u = position of first 1 in S0.u from MSB;
+    // D.u = 0xffffffff if S0.u == 0.
+    void
+    Inst_VOP1__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // findFirstOneMsb is a shared ISA helper implementing the
+        // contract described above (including the all-ones zero case).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = findFirstOneMsb(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FFBL_B32 class methods ---
+
+    Inst_VOP1__V_FFBL_B32::Inst_VOP1__V_FFBL_B32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_ffbl_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_FFBL_B32
+
+    Inst_VOP1__V_FFBL_B32::~Inst_VOP1__V_FFBL_B32()
+    {
+    } // ~Inst_VOP1__V_FFBL_B32
+
+    // --- description from .arch file ---
+    // D.u = position of first 1 in S0.u from LSB;
+    // D.u = 0xffffffff if S0.u == 0.
+    void
+    Inst_VOP1__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // findFirstOne is a shared ISA helper (LSB-first scan).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = findFirstOne(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FFBH_I32 class methods ---
+
+    Inst_VOP1__V_FFBH_I32::Inst_VOP1__V_FFBH_I32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_ffbh_i32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_FFBH_I32
+
+    Inst_VOP1__V_FFBH_I32::~Inst_VOP1__V_FFBH_I32()
+    {
+    } // ~Inst_VOP1__V_FFBH_I32
+
+    // --- description from .arch file ---
+    // D.u = position of first bit different from sign bit in S0.i from MSB;
+    // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff.
+    void
+    Inst_VOP1__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // firstOppositeSignBit is a shared ISA helper implementing the
+        // contract described above.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = firstOppositeSignBit(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FREXP_EXP_I32_F64 class methods ---
+
+    Inst_VOP1__V_FREXP_EXP_I32_F64::Inst_VOP1__V_FREXP_EXP_I32_F64(
+        InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_FREXP_EXP_I32_F64
+
+    Inst_VOP1__V_FREXP_EXP_I32_F64::~Inst_VOP1__V_FREXP_EXP_I32_F64()
+    {
+    } // ~Inst_VOP1__V_FREXP_EXP_I32_F64
+
+    // --- description from .arch file ---
+    // See V_FREXP_EXP_I32_F32.
+    void
+    Inst_VOP1__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Inf/NaN yield 0; otherwise return the frexp exponent such that
+        // src = mantissa * 2^exp with mantissa in [0.5, 1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
+                    vdst[lane] = 0;
+                } else {
+                    VecElemI32 exp = 0;
+                    std::frexp(src[lane], &exp);
+                    vdst[lane] = exp;
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FREXP_MANT_F64 class methods ---
+
+    Inst_VOP1__V_FREXP_MANT_F64::Inst_VOP1__V_FREXP_MANT_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_frexp_mant_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_FREXP_MANT_F64
+
+    Inst_VOP1__V_FREXP_MANT_F64::~Inst_VOP1__V_FREXP_MANT_F64()
+    {
+    } // ~Inst_VOP1__V_FREXP_MANT_F64
+
+    // --- description from .arch file ---
+    // See V_FREXP_MANT_F32.
+    void
+    Inst_VOP1__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Inf/NaN pass through; otherwise return the frexp mantissa
+        // (magnitude in [0.5, 1)); the exponent is discarded.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
+                    vdst[lane] = src[lane];
+                } else {
+                    VecElemI32 exp(0);
+                    vdst[lane] = std::frexp(src[lane], &exp);
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FRACT_F64 class methods ---
+
+    Inst_VOP1__V_FRACT_F64::Inst_VOP1__V_FRACT_F64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_fract_f64")
+    {
+        setFlag(ALU);
+        setFlag(F64);
+    } // Inst_VOP1__V_FRACT_F64
+
+    Inst_VOP1__V_FRACT_F64::~Inst_VOP1__V_FRACT_F64()
+    {
+    } // ~Inst_VOP1__V_FRACT_F64
+
+    // --- description from .arch file ---
+    // See V_FRACT_F32.
+    void
+    Inst_VOP1__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
+        VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Per V_FRACT_F32: D = S0 - floor(S0). std::modf would be wrong
+        // here for negative inputs because its fractional part carries
+        // the sign of the input (e.g. fract(-1.25) must be 0.75, not
+        // -0.25).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src[lane] - std::floor(src[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FREXP_EXP_I32_F32 class methods ---
+
+    Inst_VOP1__V_FREXP_EXP_I32_F32::Inst_VOP1__V_FREXP_EXP_I32_F32(
+        InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_FREXP_EXP_I32_F32
+
+    Inst_VOP1__V_FREXP_EXP_I32_F32::~Inst_VOP1__V_FREXP_EXP_I32_F32()
+    {
+    } // ~Inst_VOP1__V_FREXP_EXP_I32_F32
+
+    // --- description from .arch file ---
+    // if (S0.f == INF || S0.f == NAN) then D.i = 0;
+    // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1).
+    // Returns exponent of single precision float input, such that S0.f =
+    // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns
+    // the significand.
+    void
+    Inst_VOP1__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Inf/NaN yield 0; otherwise return the frexp exponent such that
+        // src = mantissa * 2^exp with mantissa in [0.5, 1).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
+                    vdst[lane] = 0;
+                } else {
+                    VecElemI32 exp(0);
+                    std::frexp(src[lane], &exp);
+                    vdst[lane] = exp;
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_FREXP_MANT_F32 class methods ---
+
+    Inst_VOP1__V_FREXP_MANT_F32::Inst_VOP1__V_FREXP_MANT_F32(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_frexp_mant_f32")
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP1__V_FREXP_MANT_F32
+
+    Inst_VOP1__V_FREXP_MANT_F32::~Inst_VOP1__V_FREXP_MANT_F32()
+    {
+    } // ~Inst_VOP1__V_FREXP_MANT_F32
+
+    // --- description from .arch file ---
+    // if (S0.f == INF || S0.f == NAN) then D.f = S0.f;
+    // else D.f = Mantissa(S0.f).
+    // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary
+    // --- significand of single precision float input, such that S0.f =
+    // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which
+    // --- returns integer exponent.
+    void
+    Inst_VOP1__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // Inf/NaN pass through; otherwise return the frexp mantissa
+        // (magnitude in [0.5, 1)); the exponent is discarded.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::isinf(src[lane]) || std::isnan(src[lane])) {
+                    vdst[lane] = src[lane];
+                } else {
+                    VecElemI32 exp(0);
+                    vdst[lane] = std::frexp(src[lane], &exp);
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CLREXCP class methods ---
+
+    Inst_VOP1__V_CLREXCP::Inst_VOP1__V_CLREXCP(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_clrexcp")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_CLREXCP
+
+    Inst_VOP1__V_CLREXCP::~Inst_VOP1__V_CLREXCP()
+    {
+    } // ~Inst_VOP1__V_CLREXCP
+
+    // --- description from .arch file ---
+    // Clear wave's exception state in SIMD (SP).
+    // Not modeled by the simulator.
+    void
+    Inst_VOP1__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP1__V_MOV_B64 class methods ---
+
+    Inst_VOP1__V_MOV_B64::Inst_VOP1__V_MOV_B64(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_mov_b64")
+    {
+        setFlag(ALU);
+    } // Inst_VOP1__V_MOV_B64
+
+    Inst_VOP1__V_MOV_B64::~Inst_VOP1__V_MOV_B64()
+    {
+    } // ~Inst_VOP1__V_MOV_B64
+
+    // --- description from .arch file ---
+    // D.u = S0.u.
+    // Input and output modifiers not supported; this is an untyped operation.
+    void
+    Inst_VOP1__V_MOV_B64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU64 src(gpuDynInst, instData.SRC0);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src.readSrc();
+
+        // 64-bit moves do not support DPP/SDWA encodings in this model.
+        panic_if(isDPPInst(), "DPP unimplemented for v_mov_b64");
+        panic_if(isSDWAInst(), "SDWA unimplemented for v_mov_b64");
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F16_U16 class methods ---
+
+    Inst_VOP1__V_CVT_F16_U16::Inst_VOP1__V_CVT_F16_U16(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f16_u16")
+    {
+        setFlag(ALU);
+        setFlag(F16);
+    } // Inst_VOP1__V_CVT_F16_U16
+
+    Inst_VOP1__V_CVT_F16_U16::~Inst_VOP1__V_CVT_F16_U16()
+    {
+    } // ~Inst_VOP1__V_CVT_F16_U16
+
+    // --- description from .arch file ---
+    // D.f16 = uint16_to_flt16(S.u16).
+    // Supports denormals, rounding, exception flags and saturation.
+    // F16 VOP1 operations are not implemented in this model.
+    void
+    Inst_VOP1__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP1__V_CVT_F16_I16 class methods ---
+
+    Inst_VOP1__V_CVT_F16_I16::Inst_VOP1__V_CVT_F16_I16(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_f16_i16")
+    {
+        setFlag(ALU);
+        setFlag(F16);
+    } // Inst_VOP1__V_CVT_F16_I16
+
+    Inst_VOP1__V_CVT_F16_I16::~Inst_VOP1__V_CVT_F16_I16()
+    {
+    } // ~Inst_VOP1__V_CVT_F16_I16
+
+    // --- description from .arch file ---
+    // D.f16 = int16_to_flt16(S.i16).
+    // Supports denormals, rounding, exception flags and saturation.
+    void
+    Inst_VOP1__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP1__V_CVT_U16_F16 class methods ---
+
+    Inst_VOP1__V_CVT_U16_F16::Inst_VOP1__V_CVT_U16_F16(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_u16_f16")
+    {
+        setFlag(ALU);
+        setFlag(F16);
+    } // Inst_VOP1__V_CVT_U16_F16
+
+    Inst_VOP1__V_CVT_U16_F16::~Inst_VOP1__V_CVT_U16_F16()
+    {
+    } // ~Inst_VOP1__V_CVT_U16_F16
+
+    // --- description from .arch file ---
+    // D.u16 = flt16_to_uint16(S.f16).
+    // Supports rounding, exception flags and saturation.
+    void
+    Inst_VOP1__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP1__V_CVT_I16_F16 class methods ---
+
+    Inst_VOP1__V_CVT_I16_F16::Inst_VOP1__V_CVT_I16_F16(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_cvt_i16_f16")
+    {
+        setFlag(ALU);
+        setFlag(F16);
+    } // Inst_VOP1__V_CVT_I16_F16
+
+    Inst_VOP1__V_CVT_I16_F16::~Inst_VOP1__V_CVT_I16_F16()
+    {
+    } // ~Inst_VOP1__V_CVT_I16_F16
+
+    // --- description from .arch file ---
+    // D.i16 = flt16_to_int16(S.f16).
+    // Supports rounding, exception flags and saturation.
+    void
+    Inst_VOP1__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        panicUnimplemented();
+    } // execute
+    // --- Inst_VOP1__V_RCP_F16 class methods ---
+
+    Inst_VOP1__V_RCP_F16::Inst_VOP1__V_RCP_F16(InFmt_VOP1 *iFmt)
+        : Inst_VOP1(iFmt, "v_rcp_f16")
+    {
+        setFlag(ALU);
+        setFlag(F16);
+    } // Inst_VOP1__V_RCP_F16
+
+    Inst_VOP1__V_RCP_F16::~Inst_VOP1__V_RCP_F16()
+    {
+    } // ~Inst_VOP1__V_RCP_F16
+
+    // --- description from .arch file ---
+    // if (S0.f16 == 1.0f)
+    //     D.f16 = 1.0f;
+    // else
+    //     D.f16 = ApproximateRecip(S0.f16).
+ void + Inst_VOP1__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_SQRT_F16 class methods --- + + Inst_VOP1__V_SQRT_F16::Inst_VOP1__V_SQRT_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sqrt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_SQRT_F16 + + Inst_VOP1__V_SQRT_F16::~Inst_VOP1__V_SQRT_F16() + { + } // ~Inst_VOP1__V_SQRT_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateSqrt(S0.f16). + void + Inst_VOP1__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_RSQ_F16 class methods --- + + Inst_VOP1__V_RSQ_F16::Inst_VOP1__V_RSQ_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rsq_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_RSQ_F16 + + Inst_VOP1__V_RSQ_F16::~Inst_VOP1__V_RSQ_F16() + { + } // ~Inst_VOP1__V_RSQ_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecipSqrt(S0.f16). + void + Inst_VOP1__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_LOG_F16 class methods --- + + Inst_VOP1__V_LOG_F16::Inst_VOP1__V_LOG_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_log_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_LOG_F16 + + Inst_VOP1__V_LOG_F16::~Inst_VOP1__V_LOG_F16() + { + } // ~Inst_VOP1__V_LOG_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 0.0f; + // else + // D.f16 = ApproximateLog2(S0.f16). 
+ void + Inst_VOP1__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_EXP_F16 class methods --- + + Inst_VOP1__V_EXP_F16::Inst_VOP1__V_EXP_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_exp_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_EXP_F16 + + Inst_VOP1__V_EXP_F16::~Inst_VOP1__V_EXP_F16() + { + } // ~Inst_VOP1__V_EXP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 0.0f) + // D.f16 = 1.0f; + // else + // D.f16 = Approximate2ToX(S0.f16). + void + Inst_VOP1__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_FREXP_MANT_F16 class methods --- + + Inst_VOP1__V_FREXP_MANT_F16::Inst_VOP1__V_FREXP_MANT_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_mant_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_FREXP_MANT_F16 + + Inst_VOP1__V_FREXP_MANT_F16::~Inst_VOP1__V_FREXP_MANT_F16() + { + } // ~Inst_VOP1__V_FREXP_MANT_F16 + + // --- description from .arch file --- + // if (S0.f16 == +-INF || S0.f16 == NAN) + // D.f16 = S0.f16; + // else + // D.f16 = mantissa(S0.f16). + // Result range is (-1.0,-0.5][0.5,1.0). + // C math library frexp function. + // Returns binary significand of half precision float input, such that the + // original single float = significand * (2 ** exponent). 
+ void + Inst_VOP1__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_FREXP_EXP_I16_F16 class methods --- + + Inst_VOP1__V_FREXP_EXP_I16_F16::Inst_VOP1__V_FREXP_EXP_I16_F16( + InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_FREXP_EXP_I16_F16 + + Inst_VOP1__V_FREXP_EXP_I16_F16::~Inst_VOP1__V_FREXP_EXP_I16_F16() + { + } // ~Inst_VOP1__V_FREXP_EXP_I16_F16 + + // --- description from .arch file --- + // if (S0.f16 == +-INF || S0.f16 == NAN) + // D.i16 = 0; + // else + // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1). + // C math library frexp function. + // Returns exponent of half precision float input, such that the + // original single float = significand * (2 ** exponent). + void + Inst_VOP1__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_FLOOR_F16 class methods --- + + Inst_VOP1__V_FLOOR_F16::Inst_VOP1__V_FLOOR_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_floor_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_FLOOR_F16 + + Inst_VOP1__V_FLOOR_F16::~Inst_VOP1__V_FLOOR_F16() + { + } // ~Inst_VOP1__V_FLOOR_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16); + // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f. + void + Inst_VOP1__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_CEIL_F16 class methods --- + + Inst_VOP1__V_CEIL_F16::Inst_VOP1__V_CEIL_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_ceil_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_CEIL_F16 + + Inst_VOP1__V_CEIL_F16::~Inst_VOP1__V_CEIL_F16() + { + } // ~Inst_VOP1__V_CEIL_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16); + // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f. 
+ void + Inst_VOP1__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_TRUNC_F16 class methods --- + + Inst_VOP1__V_TRUNC_F16::Inst_VOP1__V_TRUNC_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_trunc_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_TRUNC_F16 + + Inst_VOP1__V_TRUNC_F16::~Inst_VOP1__V_TRUNC_F16() + { + } // ~Inst_VOP1__V_TRUNC_F16 + + // --- description from .arch file --- + // D.f16 = trunc(S0.f16). + // Round-to-zero semantics. + void + Inst_VOP1__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_RNDNE_F16 class methods --- + + Inst_VOP1__V_RNDNE_F16::Inst_VOP1__V_RNDNE_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_rndne_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_RNDNE_F16 + + Inst_VOP1__V_RNDNE_F16::~Inst_VOP1__V_RNDNE_F16() + { + } // ~Inst_VOP1__V_RNDNE_F16 + + // --- description from .arch file --- + // D.f16 = FLOOR(S0.f16 + 0.5f); + // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f. + // Round-to-nearest-even semantics. + void + Inst_VOP1__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_FRACT_F16 class methods --- + + Inst_VOP1__V_FRACT_F16::Inst_VOP1__V_FRACT_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_fract_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_FRACT_F16 + + Inst_VOP1__V_FRACT_F16::~Inst_VOP1__V_FRACT_F16() + { + } // ~Inst_VOP1__V_FRACT_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + -floor(S0.f16). 
+ void + Inst_VOP1__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_SIN_F16 class methods --- + + Inst_VOP1__V_SIN_F16::Inst_VOP1__V_SIN_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_sin_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_SIN_F16 + + Inst_VOP1__V_SIN_F16::~Inst_VOP1__V_SIN_F16() + { + } // ~Inst_VOP1__V_SIN_F16 + + // --- description from .arch file --- + // D.f16 = sin(S0.f16 * 2 * PI). + void + Inst_VOP1__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_COS_F16 class methods --- + + Inst_VOP1__V_COS_F16::Inst_VOP1__V_COS_F16(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_cos_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP1__V_COS_F16 + + Inst_VOP1__V_COS_F16::~Inst_VOP1__V_COS_F16() + { + } // ~Inst_VOP1__V_COS_F16 + + // --- description from .arch file --- + // D.f16 = cos(S0.f16 * 2 * PI). + void + Inst_VOP1__V_COS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP1__V_EXP_LEGACY_F32 class methods --- + + Inst_VOP1__V_EXP_LEGACY_F32::Inst_VOP1__V_EXP_LEGACY_F32(InFmt_VOP1 *iFmt) + : Inst_VOP1(iFmt, "v_exp_legacy_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP1__V_EXP_LEGACY_F32 + + Inst_VOP1__V_EXP_LEGACY_F32::~Inst_VOP1__V_EXP_LEGACY_F32() + { + } // ~Inst_VOP1__V_EXP_LEGACY_F32 + + // --- description from .arch file --- + // D.f = pow(2.0, S0.f) with legacy semantics. 
    // D.f = 2 ** S0.f, per lane. Computed in double via std::pow and
    // narrowed to f32 on assignment to vdst.
    void
    Inst_VOP1__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::pow(2.0, src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_LOG_LEGACY_F32 class methods ---

    Inst_VOP1__V_LOG_LEGACY_F32::Inst_VOP1__V_LOG_LEGACY_F32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_log_legacy_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP1__V_LOG_LEGACY_F32

    Inst_VOP1__V_LOG_LEGACY_F32::~Inst_VOP1__V_LOG_LEGACY_F32()
    {
    } // ~Inst_VOP1__V_LOG_LEGACY_F32

    // --- description from .arch file ---
    // D.f = log2(S0.f). Base 2 logarithm with legacy semantics.
    void
    Inst_VOP1__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::log2(src[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP1__V_ACCVGPR_MOV_B32 class methods ---

    Inst_VOP1__V_ACCVGPR_MOV_B32::
        Inst_VOP1__V_ACCVGPR_MOV_B32(InFmt_VOP1 *iFmt)
        : Inst_VOP1(iFmt, "v_accvgpr_mov_b32")
    {
        setFlag(ALU);
    } // Inst_VOP1__V_ACCVGPR_MOV_B32

    Inst_VOP1__V_ACCVGPR_MOV_B32::~Inst_VOP1__V_ACCVGPR_MOV_B32()
    {
    } // ~Inst_VOP1__V_ACCVGPR_MOV_B32

    // Per-lane 32-bit move between accumulation VGPRs: both operand
    // indices are offset by the wavefront's accumOffset into the ACC
    // region of the register file.
    void
    Inst_VOP1__V_ACCVGPR_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        unsigned accum_offset = wf->accumOffset;

        ConstVecOperandU32 src(gpuDynInst, instData.SRC0+accum_offset);
        VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset);

        src.readSrc();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src[lane];
            }
        }

        vdst.write();
    } // execute
} // namespace VegaISA
} // namespace gem5
diff --git a/src/arch/amdgpu/vega/insts/vop2.cc b/src/arch/amdgpu/vega/insts/vop2.cc
new file mode 100644
index 0000000000..55146711b6
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/vop2.cc
@@ -0,0 +1,2221 @@
/*
 * Copyright (c) 2024 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "arch/amdgpu/vega/insts/inst_util.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "debug/VEGA.hh"

namespace gem5
{

namespace VegaISA
{
    // --- Inst_VOP2__V_CNDMASK_B32 class methods ---

    Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_cndmask_b32")
    {
        setFlag(ALU);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_CNDMASK_B32

    Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32()
    {
    } // ~Inst_VOP2__V_CNDMASK_B32

    // --- description from .arch file ---
    // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
    // as a scalar GPR in S2.
    void
    Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        // Bit `lane` of VCC selects src1 for that lane, else src0.
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
                    = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ADD_F32 class methods ---

    Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_ADD_F32

    Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32()
    {
    } // ~Inst_VOP2__V_ADD_F32

    // --- description from .arch file ---
    // D.f = S0.f + S1.f.
    // D.f = S0.f + S1.f per lane, with optional DPP cross-lane source
    // modification of SRC0.
    void
    Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        // src1 is non-const: processDPP below takes it by mutable
        // reference — presumably so SRC1 modifiers can be applied; confirm
        // against processDPP's signature.
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isDPPInst()) {
            // SRC0 is re-read from the DPP extended encoding, then the DPP
            // control (lane swizzle/masks) is applied before the add.
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] + src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_F32 class methods ---

    Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_SUB_F32

    Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32()
    {
    } // ~Inst_VOP2__V_SUB_F32

    // --- description from .arch file ---
    // D.f = S0.f - S1.f.
    // SQ translates to V_ADD_F32.
+ void + Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_F32 class methods --- + + Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_SUBREV_F32 + + Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32() + { + } // ~Inst_VOP2__V_SUBREV_F32 + + // --- description from .arch file --- + // D.f = S1.f - S0.f. + // SQ translates to V_ADD_F32. + void + Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_LEGACY_F32 class methods --- + + Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_legacy_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MUL_LEGACY_F32 + + Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32() + { + } // ~Inst_VOP2__V_MUL_LEGACY_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
    // Plain IEEE multiply per active lane.
    // NOTE(review): the DX9 "legacy" special case (0.0 * x == 0.0 even for
    // x == inf/NaN) from the arch description is not modeled here — this
    // body is a straight multiply; confirm whether that is intentional.
    void
    Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_F32 class methods ---

    Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MUL_F32

    Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32()
    {
    } // ~Inst_VOP2__V_MUL_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f.
    // Multiply with explicit handling of the IEEE special cases. Note
    // that subnormal sources are treated the same as zero throughout
    // (flush-to-zero behavior).
    void
    Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isnan(src0[lane]) ||
                    std::isnan(src1[lane])) {
                    // NaN in either source propagates.
                    vdst[lane] = NAN;
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           !std::signbit(src0[lane])) {
                    // +0 (or +denorm) times inf is NaN; otherwise the
                    // result is a zero whose sign follows src1.
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           std::signbit(src0[lane])) {
                    // -0 (or -denorm) times inf is NaN; otherwise a zero
                    // with the product's sign.
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if (std::isinf(src0[lane]) &&
                           !std::signbit(src0[lane])) {
                    // +inf times zero/denorm is NaN; otherwise an infinity
                    // with the product's sign.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else if (std::isinf(src0[lane]) &&
                           std::signbit(src0[lane])) {
                    // -inf times zero/denorm is NaN; otherwise an infinity
                    // with the product's sign.
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else {
                    // Finite, non-zero src0: ordinary multiply.
                    vdst[lane] = src0[lane] * src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_I32_I24 class methods ---

    Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_I32_I24

    Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_I32_I24

    // --- description from .arch file ---
    // D.i = S0.i[23:0] * S1.i[23:0].
    // Signed 24-bit multiply: low 24 bits of each source are
    // sign-extended before the 32-bit product is formed.
    void
    Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
                    * sext<24>(bits(src1[lane], 23, 0));
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_HI_I32_I24 class methods ---

    Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_I32_I24

    Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_HI_I32_I24

    // --- description from .arch file ---
    // D.i = (S0.i[23:0] * S1.i[23:0])>>32.
+ void + Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 tmp_src0 + = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); + VecElemI64 tmp_src1 + = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); + + vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_U32_U24 class methods --- + + Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_u32_u24") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_U32_U24 + + Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24() + { + } // ~Inst_VOP2__V_MUL_U32_U24 + + // --- description from .arch file --- + // D.u = S0.u[23:0] * S1.u[23:0]. + void + Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1, + VecOperandU32& vdst, Wavefront* wf) { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0[lane], 23, 0) * + bits(src1[lane], 23, 0); + } + } + }; + + vop2Helper(gpuDynInst, opImpl); + } // execute + // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods --- + + Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_hi_u32_u24") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_HI_U32_U24 + + Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24() + { + } // ~Inst_VOP2__V_MUL_HI_U32_U24 + + // --- description from .arch file --- + // D.i = (S0.u[23:0] * S1.u[23:0])>>32. 
+ void + Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); + VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); + vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_F32 class methods --- + + Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MIN_F32 + + Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32() + { + } // ~Inst_VOP2__V_MIN_F32 + + // --- description from .arch file --- + // D.f = (S0.f < S1.f ? S0.f : S1.f). + void + Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmin(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_F32 class methods --- + + Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP2__V_MAX_F32 + + Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32() + { + } // ~Inst_VOP2__V_MAX_F32 + + // --- description from .arch file --- + // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
+ void + Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmax(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_I32 class methods --- + + Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_i32") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_I32 + + Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32() + { + } // ~Inst_VOP2__V_MIN_I32 + + // --- description from .arch file --- + // D.i = min(S0.i, S1.i). + void + Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_I32 class methods --- + + Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_i32") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_I32 + + Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32() + { + } // ~Inst_VOP2__V_MAX_I32 + + // --- description from .arch file --- + // D.i = max(S0.i, S1.i). 
+ void + Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_U32 class methods --- + + Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_U32 + + Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32() + { + } // ~Inst_VOP2__V_MIN_U32 + + // --- description from .arch file --- + // D.u = min(S0.u, S1.u). + void + Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_U32 class methods --- + + Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_U32 + + Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32() + { + } // ~Inst_VOP2__V_MAX_U32 + + // --- description from .arch file --- + // D.u = max(S0.u, S1.u). 
+ void + Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHRREV_B32 class methods --- + + Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshrrev_b32") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHRREV_B32 + + Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32() + { + } // ~Inst_VOP2__V_LSHRREV_B32 + + // --- description from .arch file --- + // D.u = S1.u >> S0.u[4:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. + void + Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_ASHRREV_I32 class methods --- + + Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_ashrrev_i32") + { + setFlag(ALU); + } // Inst_VOP2__V_ASHRREV_I32 + + Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32() + { + } // ~Inst_VOP2__V_ASHRREV_I32 + + // --- description from .arch file --- + // D.i = signext(S1.i) >> S0.i[4:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. 
    // Arithmetic shift right with reversed operands: S1 is shifted by the
    // low five bits of S0. The right shift of a signed value relies on the
    // implementation's sign-propagating behavior.
    void
    Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHLREV_B32 class methods ---

    Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshlrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHLREV_B32

    Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32()
    {
    } // ~Inst_VOP2__V_LSHLREV_B32

    // --- description from .arch file ---
    // D.u = S1.u << S0.u[4:0].
    // SQ translates this to an internal SP opcode.
    // Shift left with reversed operands, with optional SDWA sub-dword
    // selection applied to the sources and destination.
    void
    Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and vdst during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: "
                    "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: "
                    "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_AND_B32 class methods ---

    Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_and_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_AND_B32

    Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32()
    {
    } // ~Inst_VOP2__V_AND_B32

    // --- description from .arch file ---
    // D.u = S0.u & S1.u.
    // Input and output modifiers not supported.
    // Bitwise AND, with optional DPP cross-lane modification of SRC0.
    void
    Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isDPPInst()) {
            VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_AND_B32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] & src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] & src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_OR_B32 class methods ---

    Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_or_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_OR_B32

    Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32()
    {
    } // ~Inst_VOP2__V_OR_B32

    // --- description from .arch file ---
    // D.u = S0.u | S1.u.
    // Input and output modifiers not supported.
    // Bitwise OR, with optional SDWA sub-dword selection on sources and
    // destination.
    void
    Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
                    "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] | src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] | src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_XOR_B32 class methods ---

    Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_xor_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_XOR_B32

    Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32()
    {
    } // ~Inst_VOP2__V_XOR_B32

    // --- description from .arch file ---
    // D.u = S0.u ^ S1.u.
    // Input and output modifiers not supported.
    // V_XOR_B32: per-lane bitwise XOR of the two 32-bit sources.
    void
    Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] ^ src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAC_F32 class methods ---

    Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mac_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAC);
    } // Inst_VOP2__V_MAC_F32

    Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32()
    {
    } // ~Inst_VOP2__V_MAC_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f + D.f.
    // SQ translates to V_MAD_F32.
    void
    Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
        // MAC reads the destination as the accumulator input.
        vdst.read();

        if (isDPPInst()) {
            // DPP variant: src0 is permuted across lanes per DPP_CTRL
            // before the fused multiply-add.
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_MAC_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    // std::fma gives a single-rounding multiply-add.
                    vdst[lane] = std::fma(src0_dpp[lane], src1[lane],
                                          vdst[lane]);
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MADMK_F32 class methods ---

    Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madmk_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP2__V_MADMK_F32

    Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32()
    {
    } // ~Inst_VOP2__V_MADMK_F32

    // --- description from .arch file ---
    // D.f = S0.f * K + S1.f; K is a 32-bit inline constant.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // --- modifiers.
    // SQ translates to V_MAD_F32.
+ void + Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + VecElemF32 k = extData.imm_f32; + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], k, src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MADAK_F32 class methods --- + + Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madak_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(MAD); + } // Inst_VOP2__V_MADAK_F32 + + Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32() + { + } // ~Inst_VOP2__V_MADAK_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + K; K is a 32-bit inline constant. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // --- modifiers. + // SQ translates to V_MAD_F32. 
    // V_MADAK_F32: D.f = fma(S0.f, S1.f, K), where K is the 32-bit
    // literal constant that follows the instruction word.
    void
    Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);
        VecElemF32 k = extData.imm_f32;

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], k);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ADD_CO_U32 class methods ---

    Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_ADD_CO_U32

    Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32()
    {
    } // ~Inst_VOP2__V_ADD_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u;
    // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED
    // --- overflow or carry-out for V_ADDC_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        if (isSDWAInst()) {
            // SDWA variant: apply sub-dword source selection before the
            // add; the carry-out is computed on the selected values.
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] + src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                    // Carry-out: detect wrap by doing the add in 64 bits.
                    vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane]
                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                    // Carry-out: detect wrap by doing the add in 64 bits.
                    vcc.setBit(lane, ((VecElemU64)src0[lane]
                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
                }
            }
        }

        vcc.write();
        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_CO_U32 class methods ---

    Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_SUB_CO_U32

    Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32()
    {
    } // ~Inst_VOP2__V_SUB_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u;
    // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    // V_SUB_CO_U32: per-lane unsigned subtract with borrow-out to VCC.
    void
    Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Unsigned wrap-around subtract; borrow when S1 > S0.
                vdst[lane] = src0[lane] - src1[lane];
                vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_SUBREV_CO_U32 class methods ---

    Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_SUBREV_CO_U32

    Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32()
    {
    } // ~Inst_VOP2__V_SUBREV_CO_U32

    // --- description from .arch file ---
    // D.u = S1.u - S0.u;
    // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Reversed-operand subtract; borrow when S0 > S1.
                vdst[lane] = src1[lane] - src0[lane];
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_ADDC_CO_U32 class methods ---

    Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_addc_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_ADDC_CO_U32

    Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32()
    {
    } // ~Inst_VOP2__V_ADDC_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u + VCC[threadId];
    // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0)
    // is an UNSIGNED overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Updating VCC in-loop is safe: setBit(lane) for earlier
                // lanes only touches lower bits, so bit 'lane' still holds
                // the original carry-in when read here.
                vdst[lane] = src0[lane] + src1[lane]
                    + bits(vcc.rawData(), lane);
                // Carry-out computed on the 64-bit sum of both sources
                // plus the carry-in.
                vcc.setBit(lane, ((VecElemU64)src0[lane]
                    + (VecElemU64)src1[lane]
                    + (VecElemU64)bits(vcc.rawData(), lane, lane))
                    >= 0x100000000 ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_SUBB_CO_U32 class methods ---

    Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subb_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_SUBB_CO_U32

    Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32()
    {
    } // ~Inst_VOP2__V_SUBB_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u - VCC[threadId];
    // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
    // --- overflow.
+ // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // --- source comes from the SGPR-pair at S2.u. + void + Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] + = src0[lane] - src1[lane] - bits(vcc.rawData(), lane); + vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) + > src0[lane] ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods --- + + Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subbrev_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP2__V_SUBBREV_CO_U32 + + Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32() + { + } // ~Inst_VOP2__V_SUBBREV_CO_U32 + + // --- description from .arch file --- + // D.u = S1.u - S0.u - VCC[threadId]; + // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED + // overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32. + // SQ translates this to V_SUBREV_U32 with reversed operands. 
+ void + Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] + = src1[lane] - src0[lane] - bits(vcc.rawData(), lane); + vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane)) + > src1[lane] ? 1 : 0); + } + } + + vdst.write(); + vcc.write(); + } // execute + // --- Inst_VOP2__V_ADD_F16 class methods --- + + Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_ADD_F16 + + Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16() + { + } // ~Inst_VOP2__V_ADD_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_SUB_F16 class methods --- + + Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_SUB_F16 + + Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16() + { + } // ~Inst_VOP2__V_SUB_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 - S1.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. 
+ void + Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_SUBREV_F16 class methods --- + + Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_SUBREV_F16 + + Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16() + { + } // ~Inst_VOP2__V_SUBREV_F16 + + // --- description from .arch file --- + // D.f16 = S1.f16 - S0.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. + void + Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MUL_F16 class methods --- + + Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_MUL_F16 + + Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16() + { + } // ~Inst_VOP2__V_MUL_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MAC_F16 class methods --- + + Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mac_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(MAC); + } // Inst_VOP2__V_MAC_F16 + + Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16() + { + } // ~Inst_VOP2__V_MAC_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + D.f16. + // Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. 
+ void + Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MADMK_F16 class methods --- + + Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madmk_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(MAD); + } // Inst_VOP2__V_MADMK_F16 + + Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16() + { + } // ~Inst_VOP2__V_MADMK_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored + // in the following literal DWORD. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // modifiers. Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. + void + Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MADAK_F16 class methods --- + + Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_madak_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(MAD); + } // Inst_VOP2__V_MADAK_F16 + + Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16() + { + } // ~Inst_VOP2__V_MADAK_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored + // in the following literal DWORD. + // This opcode cannot use the VOP3 encoding and cannot use input/output + // modifiers. Supports round mode, exception flags, saturation. + // SQ translates this to V_MAD_F16. + void + Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_ADD_U16 class methods --- + + Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_ADD_U16 + + Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16() + { + } // ~Inst_VOP2__V_ADD_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 + S1.u16. 
+ // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUB_U16 class methods --- + + Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_SUB_U16 + + Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16() + { + } // ~Inst_VOP2__V_SUB_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 - S1.u16. + // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_U16 class methods --- + + Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_subrev_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_SUBREV_U16 + + Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16() + { + } // ~Inst_VOP2__V_SUBREV_U16 + + // --- description from .arch file --- + // D.u16 = S1.u16 - S0.u16. + // Supports saturation (unsigned 16-bit integer domain). + // SQ translates this to V_SUB_U16 with reversed operands. 
+ void + Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MUL_LO_U16 class methods --- + + Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_mul_lo_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_MUL_LO_U16 + + Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16() + { + } // ~Inst_VOP2__V_MUL_LO_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 * S1.u16. + // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHLREV_B16 class methods --- + + Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshlrev_b16") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHLREV_B16 + + Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16() + { + } // ~Inst_VOP2__V_LSHLREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LSHRREV_B16 class methods --- + + Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_lshrrev_b16") + { + setFlag(ALU); + } // Inst_VOP2__V_LSHRREV_B16 + + Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16() + { + } // ~Inst_VOP2__V_LSHRREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. + void + Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_ASHRREV_I16 class methods --- + + Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_ashrrev_i16") + { + setFlag(ALU); + } // Inst_VOP2__V_ASHRREV_I16 + + Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16() + { + } // ~Inst_VOP2__V_ASHRREV_I16 + + // --- description from .arch file --- + // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_F16 class methods --- + + Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_MAX_F16 + + Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16() + { + } // ~Inst_VOP2__V_MAX_F16 + + // --- description from .arch file --- + // D.f16 = max(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. + void + Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MIN_F16 class methods --- + + Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_MIN_F16 + + Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16() + { + } // ~Inst_VOP2__V_MIN_F16 + + // --- description from .arch file --- + // D.f16 = min(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. + void + Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_MAX_U16 class methods --- + + Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_U16 + + Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16() + { + } // ~Inst_VOP2__V_MAX_U16 + + // --- description from .arch file --- + // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]). 
+ void + Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MAX_I16 class methods --- + + Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_max_i16") + { + setFlag(ALU); + } // Inst_VOP2__V_MAX_I16 + + Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16() + { + } // ~Inst_VOP2__V_MAX_I16 + + // --- description from .arch file --- + // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]). + void + Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_U16 class methods --- + + Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_u16") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_U16 + + Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16() + { + } // ~Inst_VOP2__V_MIN_U16 + + // --- description from .arch file --- + // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]). 
+ void + Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_MIN_I16 class methods --- + + Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_min_i16") + { + setFlag(ALU); + } // Inst_VOP2__V_MIN_I16 + + Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16() + { + } // ~Inst_VOP2__V_MIN_I16 + + // --- description from .arch file --- + // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]). + void + Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_LDEXP_F16 class methods --- + + Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_ldexp_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP2__V_LDEXP_F16 + + Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16() + { + } // ~Inst_VOP2__V_LDEXP_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * (2 ** S1.i16). 
+ void + Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP2__V_ADD_U32 class methods --- + + Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_add_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_ADD_U32 + + Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32() + { + } // ~Inst_VOP2__V_ADD_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u; + void + Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + VecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + if (isSDWAInst()) { + VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); + // use copies of original src0, src1, and dest during selecting + VecOperandU32 origSrc0_sdwa(gpuDynInst, + extData.iFmt_VOP_SDWA.SRC0); + VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); + + DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. 
SRC0: register v[%d], " + "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, " + "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, " + "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n", + extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL, + extData.iFmt_VOP_SDWA.DST_U, + extData.iFmt_VOP_SDWA.CLMP, + extData.iFmt_VOP_SDWA.SRC0_SEL, + extData.iFmt_VOP_SDWA.SRC0_SEXT, + extData.iFmt_VOP_SDWA.SRC0_NEG, + extData.iFmt_VOP_SDWA.SRC0_ABS, + extData.iFmt_VOP_SDWA.SRC1_SEL, + extData.iFmt_VOP_SDWA.SRC1_SEXT, + extData.iFmt_VOP_SDWA.SRC1_NEG, + extData.iFmt_VOP_SDWA.SRC1_ABS); + + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_sdwa[lane] + src1[lane]; + origVdst[lane] = vdst[lane]; // keep copy consistent + } + } + + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUB_U32 class methods --- + + Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt) + : Inst_VOP2(iFmt, "v_sub_u32") + { + setFlag(ALU); + } // Inst_VOP2__V_SUB_U32 + + Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32() + { + } // ~Inst_VOP2__V_SUB_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u; + void + Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP2__V_SUBREV_U32 class methods --- + + 
Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_subrev_u32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP2__V_SUBREV_U32
+
+    Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32()
+    {
+    } // ~Inst_VOP2__V_SUBREV_U32
+
+    // --- description from .arch file ---
+    // D.u = S1.u - S0.u;
+    void
+    Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src1[lane] - src0[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP2__V_FMAC_F32 class methods ---
+
+    Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_fmac_f32")
+    {
+        setFlag(ALU);
+        // NOTE(review): other F32 ops also setFlag(F32); confirm omission here
+        // is intentional.
+    } // Inst_VOP2__V_FMAC_F32
+
+    Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32()
+    {
+    } // ~Inst_VOP2__V_FMAC_F32
+
+    // --- description from .arch file ---
+    // D.f = S0.f * S1.f + D.f. (comment was mis-copied from v_subrev_u32)
+    void
+    Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+        vdst.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP2__V_XNOR_B32 class methods ---
+
+    Inst_VOP2__V_XNOR_B32::Inst_VOP2__V_XNOR_B32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_xnor_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP2__V_XNOR_B32
+
+    Inst_VOP2__V_XNOR_B32::~Inst_VOP2__V_XNOR_B32()
+    {
+    } // ~Inst_VOP2__V_XNOR_B32
+
+    // --- description from .arch file ---
+    // D.u = ~(S0.u ^ S1.u). (comment was mis-copied from v_subrev_u32)
+    void
+    Inst_VOP2__V_XNOR_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+        vdst.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = ~(src0[lane] ^ src1[lane]);
+            }
+        }
+
+        vdst.write();
+    } // execute
+} // namespace VegaISA
+} // namespace gem5
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
new file mode 100644
index 0000000000..b9fee17353
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -0,0 +1,9109 @@
+/*
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/common/dtype/mxfp_types.hh" +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP3__V_CNDMASK_B32 class methods --- + + Inst_VOP3__V_CNDMASK_B32::Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cndmask_b32", false) + { + setFlag(ALU); + setFlag(ReadsVCC); + } // Inst_VOP3__V_CNDMASK_B32 + + Inst_VOP3__V_CNDMASK_B32::~Inst_VOP3__V_CNDMASK_B32() + { + } // ~Inst_VOP3__V_CNDMASK_B32 + + // --- description from .arch file --- + // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC + // as a scalar GPR in S2. + void + Inst_VOP3__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(vcc.rawData(), lane) + ? 
src1[lane] : src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_F32 class methods --- + + Inst_VOP3__V_ADD_F32::Inst_VOP3__V_ADD_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_ADD_F32 + + Inst_VOP3__V_ADD_F32::~Inst_VOP3__V_ADD_F32() + { + } // ~Inst_VOP3__V_ADD_F32 + + // --- description from .arch file --- + // D.f = S0.f + S1.f. + void + Inst_VOP3__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUB_F32 class methods --- + + Inst_VOP3__V_SUB_F32::Inst_VOP3__V_SUB_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SUB_F32 + + Inst_VOP3__V_SUB_F32::~Inst_VOP3__V_SUB_F32() + { + } // ~Inst_VOP3__V_SUB_F32 + + // --- description from .arch file --- + // D.f = S0.f - S1.f. + // SQ translates to V_ADD_F32. 
+ void + Inst_VOP3__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUBREV_F32 class methods --- + + Inst_VOP3__V_SUBREV_F32::Inst_VOP3__V_SUBREV_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_subrev_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SUBREV_F32 + + Inst_VOP3__V_SUBREV_F32::~Inst_VOP3__V_SUBREV_F32() + { + } // ~Inst_VOP3__V_SUBREV_F32 + + // --- description from .arch file --- + // D.f = S1.f - S0.f. + // SQ translates to V_ADD_F32. 
+ void + Inst_VOP3__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_LEGACY_F32 class methods --- + + Inst_VOP3__V_MUL_LEGACY_F32::Inst_VOP3__V_MUL_LEGACY_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_legacy_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MUL_LEGACY_F32 + + Inst_VOP3__V_MUL_LEGACY_F32::~Inst_VOP3__V_MUL_LEGACY_F32() + { + } // ~Inst_VOP3__V_MUL_LEGACY_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0). 
+    void
+    Inst_VOP3__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        // NaN/zero/Inf combinations are handled explicitly; denormal inputs
+        // are classified together with zero (flushed) in the cases below.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::isnan(src0[lane]) ||
+                    std::isnan(src1[lane])) {
+                    vdst[lane] = NAN;
+                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
+                           std::fpclassify(src0[lane]) == FP_ZERO) &&
+                           !std::signbit(src0[lane])) {
+                    if (std::isinf(src1[lane])) {
+                        vdst[lane] = NAN;
+                    } else if (!std::signbit(src1[lane])) {
+                        vdst[lane] = +0.0;
+                    } else {
+                        vdst[lane] = -0.0;
+                    }
+                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
+                           std::fpclassify(src0[lane]) == FP_ZERO) &&
+                           std::signbit(src0[lane])) {
+                    if (std::isinf(src1[lane])) {
+                        vdst[lane] = NAN;
+                    } else if (std::signbit(src1[lane])) {
+                        vdst[lane] = +0.0;
+                    } else {
+                        vdst[lane] = -0.0;
+                    }
+                } else if (std::isinf(src0[lane]) &&
+                           !std::signbit(src0[lane])) {
+                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
+                        std::fpclassify(src1[lane]) == FP_ZERO) {
+                        vdst[lane] = NAN;
+                    } else if (!std::signbit(src1[lane])) {
+                        vdst[lane] = +INFINITY;
+                    } else {
+                        vdst[lane] = -INFINITY;
+                    }
+                } else if (std::isinf(src0[lane]) &&
+                           std::signbit(src0[lane])) {
+                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
+                        std::fpclassify(src1[lane]) == FP_ZERO) {
+                        vdst[lane] = NAN;
+                    } else if (std::signbit(src1[lane])) {
+                        vdst[lane] = +INFINITY;
+                    } else {
+                        vdst[lane] = -INFINITY;
+                    }
+                } else {
+                    vdst[lane] = src0[lane] * src1[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MUL_F32 class methods ---
+
+    Inst_VOP3__V_MUL_F32::Inst_VOP3__V_MUL_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mul_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_MUL_F32
+
+    Inst_VOP3__V_MUL_F32::~Inst_VOP3__V_MUL_F32()
+    {
+    } // ~Inst_VOP3__V_MUL_F32
+
+    // --- description from .arch file ---
+    // D.f = S0.f * S1.f.
+    void
+    Inst_VOP3__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                if (std::isnan(src0[lane]) ||
+                    std::isnan(src1[lane])) {
+                    vdst[lane] = NAN;
+                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
+                           std::fpclassify(src0[lane]) == FP_ZERO) &&
+                           !std::signbit(src0[lane])) {
+                    if (std::isinf(src1[lane])) {
+                        vdst[lane] = NAN;
+                    } else if (!std::signbit(src1[lane])) {
+                        vdst[lane] = +0.0;
+                    } else {
+                        vdst[lane] = -0.0;
+                    }
+                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
+                           std::fpclassify(src0[lane]) == FP_ZERO) &&
+                           std::signbit(src0[lane])) {
+                    if (std::isinf(src1[lane])) {
+                        vdst[lane] = NAN;
+                    } else if (std::signbit(src1[lane])) {
+                        vdst[lane] = +0.0;
+                    } else {
+                        vdst[lane] = -0.0;
+                    }
+                } else if (std::isinf(src0[lane]) &&
+                           !std::signbit(src0[lane])) {
+                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
+                        std::fpclassify(src1[lane]) == FP_ZERO) {
+                        vdst[lane] = NAN;
+                    } else if (!std::signbit(src1[lane])) {
+                        vdst[lane] = +INFINITY;
+                    } else {
+                        vdst[lane] = -INFINITY;
+                    }
+                } else if (std::isinf(src0[lane]) &&
+                           std::signbit(src0[lane])) {
+                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
+                        std::fpclassify(src1[lane]) == FP_ZERO) {
+                        vdst[lane] = NAN;
+                    } else if (std::signbit(src1[lane])) {
+                        vdst[lane] = +INFINITY;
+                    } else {
+                        vdst[lane] = -INFINITY;
+                    }
+                } else {
+                    vdst[lane] = src0[lane] * src1[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MUL_I32_I24 class methods ---
+
+    Inst_VOP3__V_MUL_I32_I24::Inst_VOP3__V_MUL_I32_I24(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mul_i32_i24", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_MUL_I32_I24
+
+    Inst_VOP3__V_MUL_I32_I24::~Inst_VOP3__V_MUL_I32_I24()
+    {
+    } // ~Inst_VOP3__V_MUL_I32_I24
+
+    // --- description from .arch file ---
+    // D.i = S0.i[23:0] * S1.i[23:0].
+    void
+    Inst_VOP3__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc(); // was read(); VOP3 SRC1 may name an SGPR/constant
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
+                    * sext<24>(bits(src1[lane], 23, 0));
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_MUL_HI_I32_I24 class methods ---
+
+    Inst_VOP3__V_MUL_HI_I32_I24::Inst_VOP3__V_MUL_HI_I32_I24(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_mul_hi_i32_i24", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_MUL_HI_I32_I24
+
+
Inst_VOP3__V_MUL_HI_I32_I24::~Inst_VOP3__V_MUL_HI_I32_I24() + { + } // ~Inst_VOP3__V_MUL_HI_I32_I24 + + // --- description from .arch file --- + // D.i = (S0.i[23:0] * S1.i[23:0])>>32. + void + Inst_VOP3__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 tmp_src0 + = (VecElemI64)sext<24>(bits(src0[lane], 23, 0)); + VecElemI64 tmp_src1 + = (VecElemI64)sext<24>(bits(src1[lane], 23, 0)); + + vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_U32_U24 class methods --- + + Inst_VOP3__V_MUL_U32_U24::Inst_VOP3__V_MUL_U32_U24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_u32_u24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_U32_U24 + + Inst_VOP3__V_MUL_U32_U24::~Inst_VOP3__V_MUL_U32_U24() + { + } // ~Inst_VOP3__V_MUL_U32_U24 + + // --- description from .arch file --- + // D.u = S0.u[23:0] * S1.u[23:0]. 
+ void + Inst_VOP3__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_HI_U32_U24 class methods --- + + Inst_VOP3__V_MUL_HI_U32_U24::Inst_VOP3__V_MUL_HI_U32_U24(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_hi_u32_u24", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_HI_U32_U24 + + Inst_VOP3__V_MUL_HI_U32_U24::~Inst_VOP3__V_MUL_HI_U32_U24() + { + } // ~Inst_VOP3__V_MUL_HI_U32_U24 + + // --- description from .arch file --- + // D.i = (S0.u[23:0] * S1.u[23:0])>>32. 
+ void + Inst_VOP3__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0); + VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0); + vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_F32 class methods --- + + Inst_VOP3__V_MIN_F32::Inst_VOP3__V_MIN_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MIN_F32 + + Inst_VOP3__V_MIN_F32::~Inst_VOP3__V_MIN_F32() + { + } // ~Inst_VOP3__V_MIN_F32 + + // --- description from .arch file --- + // D.f = (S0.f < S1.f ? S0.f : S1.f). 
+ void + Inst_VOP3__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmin(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_F32 class methods --- + + Inst_VOP3__V_MAX_F32::Inst_VOP3__V_MAX_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_MAX_F32 + + Inst_VOP3__V_MAX_F32::~Inst_VOP3__V_MAX_F32() + { + } // ~Inst_VOP3__V_MAX_F32 + + // --- description from .arch file --- + // D.f = (S0.f >= S1.f ? S0.f : S1.f). 
+ void + Inst_VOP3__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmax(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_I32 class methods --- + + Inst_VOP3__V_MIN_I32::Inst_VOP3__V_MIN_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN_I32 + + Inst_VOP3__V_MIN_I32::~Inst_VOP3__V_MIN_I32() + { + } // ~Inst_VOP3__V_MIN_I32 + + // --- description from .arch file --- + // D.i = min(S0.i, S1.i). 
+ void + Inst_VOP3__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_I32 class methods --- + + Inst_VOP3__V_MAX_I32::Inst_VOP3__V_MAX_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX_I32 + + Inst_VOP3__V_MAX_I32::~Inst_VOP3__V_MAX_I32() + { + } // ~Inst_VOP3__V_MAX_I32 + + // --- description from .arch file --- + // D.i = max(S0.i, S1.i). 
+ void + Inst_VOP3__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_U32 class methods --- + + Inst_VOP3__V_MIN_U32::Inst_VOP3__V_MIN_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MIN_U32 + + Inst_VOP3__V_MIN_U32::~Inst_VOP3__V_MIN_U32() + { + } // ~Inst_VOP3__V_MIN_U32 + + // --- description from .arch file --- + // D.u = min(S0.u, S1.u). 
+ void + Inst_VOP3__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::min(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_U32 class methods --- + + Inst_VOP3__V_MAX_U32::Inst_VOP3__V_MAX_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MAX_U32 + + Inst_VOP3__V_MAX_U32::~Inst_VOP3__V_MAX_U32() + { + } // ~Inst_VOP3__V_MAX_U32 + + // --- description from .arch file --- + // D.u = max(S0.u, S1.u). 
+ void + Inst_VOP3__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::max(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHRREV_B32 class methods --- + + Inst_VOP3__V_LSHRREV_B32::Inst_VOP3__V_LSHRREV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshrrev_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHRREV_B32 + + Inst_VOP3__V_LSHRREV_B32::~Inst_VOP3__V_LSHRREV_B32() + { + } // ~Inst_VOP3__V_LSHRREV_B32 + + // --- description from .arch file --- + // D.u = S1.u >> S0.u[4:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ASHRREV_I32 class methods --- + + Inst_VOP3__V_ASHRREV_I32::Inst_VOP3__V_ASHRREV_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ashrrev_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ASHRREV_I32 + + Inst_VOP3__V_ASHRREV_I32::~Inst_VOP3__V_ASHRREV_I32() + { + } // ~Inst_VOP3__V_ASHRREV_I32 + + // --- description from .arch file --- + // D.i = signext(S1.i) >> S0.i[4:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHLREV_B32 class methods --- + + Inst_VOP3__V_LSHLREV_B32::Inst_VOP3__V_LSHLREV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshlrev_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHLREV_B32 + + Inst_VOP3__V_LSHLREV_B32::~Inst_VOP3__V_LSHLREV_B32() + { + } // ~Inst_VOP3__V_LSHLREV_B32 + + // --- description from .arch file --- + // D.u = S1.u << S0.u[4:0]. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_AND_B32 class methods --- + + Inst_VOP3__V_AND_B32::Inst_VOP3__V_AND_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_and_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_AND_B32 + + Inst_VOP3__V_AND_B32::~Inst_VOP3__V_AND_B32() + { + } // ~Inst_VOP3__V_AND_B32 + + // --- description from .arch file --- + // D.u = S0.u & S1.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_AND_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] & src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_OR_B32 class methods --- + + Inst_VOP3__V_OR_B32::Inst_VOP3__V_OR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_or_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_OR_B32 + + Inst_VOP3__V_OR_B32::~Inst_VOP3__V_OR_B32() + { + } // ~Inst_VOP3__V_OR_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] | src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_OR3_B32 class methods --- + + Inst_VOP3__V_OR3_B32::Inst_VOP3__V_OR3_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_or3_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_OR3_B32 + + Inst_VOP3__V_OR3_B32::~Inst_VOP3__V_OR3_B32() + { + } // ~Inst_VOP3__V_OR3_B32 + + // --- description from .arch file --- + // D.u = S0.u | S1.u | S2.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_OR3_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] | src1[lane] | src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_XOR_B32 class methods --- + + Inst_VOP3__V_XOR_B32::Inst_VOP3__V_XOR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_xor_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_XOR_B32 + + Inst_VOP3__V_XOR_B32::~Inst_VOP3__V_XOR_B32() + { + } // ~Inst_VOP3__V_XOR_B32 + + // --- description from .arch file --- + // D.u = S0.u ^ S1.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] ^ src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAC_F32 class methods --- + + Inst_VOP3__V_MAC_F32::Inst_VOP3__V_MAC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mac_f32", false) + { + setFlag(ALU); + setFlag(F32); + setFlag(MAC); + } // Inst_VOP3__V_MAC_F32 + + Inst_VOP3__V_MAC_F32::~Inst_VOP3__V_MAC_F32() + { + } // ~Inst_VOP3__V_MAC_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + D.f. + // SQ translates to V_MAD_F32. 
    // D.f = S0.f * S1.f + D.f, computed as a fused multiply-add per lane.
    // Note: the destination is also a source (the accumulator), so vdst is
    // read before the loop. ABS/NEG input modifiers are honored for the two
    // FP sources; the third modifier bit must be clear (no third source).
    void
    Inst_VOP3__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        // MAC accumulates into the destination, so its prior value is read.
        vdst.read();

        // Apply |S0|, |S1| before negation (abs first, then neg, per the
        // VOP3 input-modifier ordering used throughout this file).
        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // std::fma gives a single-rounding multiply-add.
                vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_ADD_CO_U32 class methods ---

    Inst_VOP3__V_ADD_CO_U32::Inst_VOP3__V_ADD_CO_U32(InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_add_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP3__V_ADD_CO_U32

    Inst_VOP3__V_ADD_CO_U32::~Inst_VOP3__V_ADD_CO_U32()
    {
    } // ~Inst_VOP3__V_ADD_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u;
    // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED
    // --- overflow or carry-out for V_ADDC_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    // D.u = S0.u + S1.u with per-lane carry-out. The carry-out destination
    // is the VOP3B SDST field (an arbitrary SGPR pair, not necessarily the
    // architectural VCC).
    void
    Inst_VOP3__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         * (VOP3B has no ABS field, so only NEG is checked here)
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane];
                // Carry-out: 1 if the widened 64-bit sum overflows 32 bits.
                vcc.setBit(lane, ((VecElemU64)src0[lane]
                    + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP3__V_SUB_CO_U32 class methods ---

    Inst_VOP3__V_SUB_CO_U32::Inst_VOP3__V_SUB_CO_U32(InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_sub_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP3__V_SUB_CO_U32

    Inst_VOP3__V_SUB_CO_U32::~Inst_VOP3__V_SUB_CO_U32()
    {
    } // ~Inst_VOP3__V_SUB_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u;
    // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    // D.u = S0.u - S1.u with per-lane borrow-out written to the VOP3B SDST
    // SGPR pair.
    void
    Inst_VOP3__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
                // Borrow-out: 1 when the (wrapping) subtraction underflows.
                vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP3__V_SUBREV_CO_U32 class methods ---

    Inst_VOP3__V_SUBREV_CO_U32::Inst_VOP3__V_SUBREV_CO_U32(
          InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_subrev_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP3__V_SUBREV_CO_U32

    Inst_VOP3__V_SUBREV_CO_U32::~Inst_VOP3__V_SUBREV_CO_U32()
    {
    } // ~Inst_VOP3__V_SUBREV_CO_U32

    // --- description from .arch file ---
    // D.u = S1.u - S0.u;
    // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    // SQ translates this to V_SUB_U32 with reversed operands.
    // D.u = S1.u - S0.u (reversed-operand subtract) with per-lane
    // borrow-out written to the VOP3B SDST SGPR pair.
    void
    Inst_VOP3__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, instData.SDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Reversed relative to V_SUB_CO_U32: minuend is src1.
                vdst[lane] = src1[lane] - src0[lane];
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP3__V_ADDC_CO_U32 class methods ---

    Inst_VOP3__V_ADDC_CO_U32::Inst_VOP3__V_ADDC_CO_U32(InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_addc_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP3__V_ADDC_CO_U32

    Inst_VOP3__V_ADDC_CO_U32::~Inst_VOP3__V_ADDC_CO_U32()
    {
    } // ~Inst_VOP3__V_ADDC_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u + VCC[threadId];
    // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0)
    // is an UNSIGNED overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
+ void + Inst_VOP3__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + ScalarOperandU64 sdst(gpuDynInst, instData.SDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane] + + bits(vcc.rawData(), lane); + sdst.setBit(lane, ((VecElemU64)src0[lane] + + (VecElemU64)src1[lane] + + (VecElemU64)bits(vcc.rawData(), lane)) + >= 0x100000000 ? 1 : 0); + } + } + + vdst.write(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_SUBB_CO_U32 class methods --- + + Inst_VOP3__V_SUBB_CO_U32::Inst_VOP3__V_SUBB_CO_U32(InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_subb_co_u32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(ReadsVCC); + } // Inst_VOP3__V_SUBB_CO_U32 + + Inst_VOP3__V_SUBB_CO_U32::~Inst_VOP3__V_SUBB_CO_U32() + { + } // ~Inst_VOP3__V_SUBB_CO_U32 + + // --- description from .arch file --- + // D.u = S0.u - S1.u - VCC[threadId]; + // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED + // --- overflow. + // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC + // --- source comes from the SGPR-pair at S2.u. 
    // D.u = S0.u - S1.u - carry-in, with per-lane borrow-out. Carry-in is
    // read from the SGPR pair at SRC2; borrow-out is written to SDST.
    void
    Inst_VOP3__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
        ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        vcc.read();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane]
                    - bits(vcc.rawData(), lane);
                // Borrow-out: 1 when subtrahend + carry-in exceeds the
                // minuend.
                sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
                    > src0[lane] ? 1 : 0);
            }
        }

        vdst.write();
        sdst.write();
    } // execute
    // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods ---

    Inst_VOP3__V_SUBBREV_CO_U32::Inst_VOP3__V_SUBBREV_CO_U32(
          InFmt_VOP3B *iFmt)
        : Inst_VOP3B(iFmt, "v_subbrev_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP3__V_SUBBREV_CO_U32

    Inst_VOP3__V_SUBBREV_CO_U32::~Inst_VOP3__V_SUBBREV_CO_U32()
    {
    } // ~Inst_VOP3__V_SUBBREV_CO_U32

    // --- description from .arch file ---
    // D.u = S1.u - S0.u - VCC[threadId];
    // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
    // overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32.
+ void + Inst_VOP3__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST); + ScalarOperandU64 vcc(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vcc.read(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane] + - bits(vcc.rawData(), lane); + sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane)) + > src0[lane] ? 1 : 0); + } + } + + vdst.write(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_F16 class methods --- + + Inst_VOP3__V_ADD_F16::Inst_VOP3__V_ADD_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_ADD_F16 + + Inst_VOP3__V_ADD_F16::~Inst_VOP3__V_ADD_F16() + { + } // ~Inst_VOP3__V_ADD_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 + S1.f16. + // Supports denormals, round mode, exception flags, saturation. + void + Inst_VOP3__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_SUB_F16 class methods --- + + Inst_VOP3__V_SUB_F16::Inst_VOP3__V_SUB_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_SUB_F16 + + Inst_VOP3__V_SUB_F16::~Inst_VOP3__V_SUB_F16() + { + } // ~Inst_VOP3__V_SUB_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 - S1.f16. + // Supports denormals, round mode, exception flags, saturation. + // SQ translates to V_ADD_F16. 
    void
    Inst_VOP3__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // F16 arithmetic is not modeled; aborts the simulation if decoded.
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_SUBREV_F16 class methods ---

    Inst_VOP3__V_SUBREV_F16::Inst_VOP3__V_SUBREV_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_subrev_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_SUBREV_F16

    Inst_VOP3__V_SUBREV_F16::~Inst_VOP3__V_SUBREV_F16()
    {
    } // ~Inst_VOP3__V_SUBREV_F16

    // --- description from .arch file ---
    // D.f16 = S1.f16 - S0.f16.
    // Supports denormals, round mode, exception flags, saturation.
    // SQ translates to V_ADD_F16.
    void
    Inst_VOP3__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // F16 arithmetic is not modeled; aborts the simulation if decoded.
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MUL_F16 class methods ---

    Inst_VOP3__V_MUL_F16::Inst_VOP3__V_MUL_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mul_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_MUL_F16

    Inst_VOP3__V_MUL_F16::~Inst_VOP3__V_MUL_F16()
    {
    } // ~Inst_VOP3__V_MUL_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16.
    // Supports denormals, round mode, exception flags, saturation.
    void
    Inst_VOP3__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // F16 arithmetic is not modeled; aborts the simulation if decoded.
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MAC_F16 class methods ---

    Inst_VOP3__V_MAC_F16::Inst_VOP3__V_MAC_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mac_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAC);
    } // Inst_VOP3__V_MAC_F16

    Inst_VOP3__V_MAC_F16::~Inst_VOP3__V_MAC_F16()
    {
    } // ~Inst_VOP3__V_MAC_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16 + D.f16.
    // Supports round mode, exception flags, saturation.
    // SQ translates this to V_MAD_F16.
+ void + Inst_VOP3__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_ADD_U16 class methods --- + + Inst_VOP3__V_ADD_U16::Inst_VOP3__V_ADD_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD_U16 + + Inst_VOP3__V_ADD_U16::~Inst_VOP3__V_ADD_U16() + { + } // ~Inst_VOP3__V_ADD_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 + S1.u16. + // Supports saturation (unsigned 16-bit integer domain). + void + Inst_VOP3__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUB_U16 class methods --- + + Inst_VOP3__V_SUB_U16::Inst_VOP3__V_SUB_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUB_U16 + + Inst_VOP3__V_SUB_U16::~Inst_VOP3__V_SUB_U16() + { + } // ~Inst_VOP3__V_SUB_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 - S1.u16. + // Supports saturation (unsigned 16-bit integer domain). 
+ void + Inst_VOP3__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUBREV_U16 class methods --- + + Inst_VOP3__V_SUBREV_U16::Inst_VOP3__V_SUBREV_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_subrev_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUBREV_U16 + + Inst_VOP3__V_SUBREV_U16::~Inst_VOP3__V_SUBREV_U16() + { + } // ~Inst_VOP3__V_SUBREV_U16 + + // --- description from .arch file --- + // D.u16 = S1.u16 - S0.u16. + // Supports saturation (unsigned 16-bit integer domain). + // SQ translates this to V_SUB_U16 with reversed operands. 
+ void + Inst_VOP3__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_LO_U16 class methods --- + + Inst_VOP3__V_MUL_LO_U16::Inst_VOP3__V_MUL_LO_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_lo_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_LO_U16 + + Inst_VOP3__V_MUL_LO_U16::~Inst_VOP3__V_MUL_LO_U16() + { + } // ~Inst_VOP3__V_MUL_LO_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 * S1.u16. + // Supports saturation (unsigned 16-bit integer domain). 
+ void + Inst_VOP3__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHLREV_B16 class methods --- + + Inst_VOP3__V_LSHLREV_B16::Inst_VOP3__V_LSHLREV_B16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshlrev_b16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHLREV_B16 + + Inst_VOP3__V_LSHLREV_B16::~Inst_VOP3__V_LSHLREV_B16() + { + } // ~Inst_VOP3__V_LSHLREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] << S0.u[3:0]. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 3, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHRREV_B16 class methods --- + + Inst_VOP3__V_LSHRREV_B16::Inst_VOP3__V_LSHRREV_B16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshrrev_b16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHRREV_B16 + + Inst_VOP3__V_LSHRREV_B16::~Inst_VOP3__V_LSHRREV_B16() + { + } // ~Inst_VOP3__V_LSHRREV_B16 + + // --- description from .arch file --- + // D.u[15:0] = S1.u[15:0] >> S0.u[3:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. 
    // D.u[15:0] = S1.u[15:0] >> S0.u[3:0] (logical shift, zero fill).
    //
    // NOTE(review): unlike the sibling integer shifts (e.g. V_LSHLREV_B16),
    // which assert the ABS/NEG input modifiers are unset, this one applies
    // abs/neg modifiers to *unsigned 16-bit* operands. That looks like a
    // copy-paste from an FP op — confirm against the upstream reference
    // before changing, since modifiers on integer sources should normally
    // be rejected. Behavior is left exactly as written.
    void
    Inst_VOP3__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_ASHRREV_I16 class methods ---

    Inst_VOP3__V_ASHRREV_I16::Inst_VOP3__V_ASHRREV_I16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ashrrev_i16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ASHRREV_I16

    Inst_VOP3__V_ASHRREV_I16::~Inst_VOP3__V_ASHRREV_I16()
    {
    } // ~Inst_VOP3__V_ASHRREV_I16

    // --- description from .arch file ---
    // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
    // The vacated bits are set to the sign bit of the input value.
    // SQ translates this to an internal SP opcode.
+ void + Inst_VOP3__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_F16 class methods --- + + Inst_VOP3__V_MAX_F16::Inst_VOP3__V_MAX_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_MAX_F16 + + Inst_VOP3__V_MAX_F16::~Inst_VOP3__V_MAX_F16() + { + } // ~Inst_VOP3__V_MAX_F16 + + // --- description from .arch file --- + // D.f16 = max(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. + void + Inst_VOP3__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MIN_F16 class methods --- + + Inst_VOP3__V_MIN_F16::Inst_VOP3__V_MIN_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_MIN_F16 + + Inst_VOP3__V_MIN_F16::~Inst_VOP3__V_MIN_F16() + { + } // ~Inst_VOP3__V_MIN_F16 + + // --- description from .arch file --- + // D.f16 = min(S0.f16, S1.f16). + // IEEE compliant. Supports denormals, round mode, exception flags, + // saturation. 
    void
    Inst_VOP3__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // F16 arithmetic is not modeled; aborts the simulation if decoded.
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_MAX_U16 class methods ---

    Inst_VOP3__V_MAX_U16::Inst_VOP3__V_MAX_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_max_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MAX_U16

    Inst_VOP3__V_MAX_U16::~Inst_VOP3__V_MAX_U16()
    {
    } // ~Inst_VOP3__V_MAX_U16

    // --- description from .arch file ---
    // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
    //
    // NOTE(review): applies abs/neg modifiers to unsigned 16-bit sources,
    // unlike sibling integer ops which assert the modifiers are unset —
    // looks like an FP copy-paste; confirm against the upstream reference
    // before changing. Behavior is left exactly as written.
    void
    Inst_VOP3__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MAX_I16 class methods ---

    Inst_VOP3__V_MAX_I16::Inst_VOP3__V_MAX_I16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_max_i16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MAX_I16

    Inst_VOP3__V_MAX_I16::~Inst_VOP3__V_MAX_I16()
    {
    } // ~Inst_VOP3__V_MAX_I16

    // --- description from .arch file ---
    // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
    // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]), per active lane.
    // NOTE(review): honors abs/neg input modifiers on integer sources —
    // confirm against the upstream reference; left exactly as written.
    void
    Inst_VOP3__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MIN_U16 class methods ---

    Inst_VOP3__V_MIN_U16::Inst_VOP3__V_MIN_U16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_min_u16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN_U16

    Inst_VOP3__V_MIN_U16::~Inst_VOP3__V_MIN_U16()
    {
    } // ~Inst_VOP3__V_MIN_U16

    // --- description from .arch file ---
    // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
    // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]), per active lane.
    // NOTE(review): honors abs/neg input modifiers on unsigned sources —
    // confirm against the upstream reference; left exactly as written.
    void
    Inst_VOP3__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MIN_I16 class methods ---

    Inst_VOP3__V_MIN_I16::Inst_VOP3__V_MIN_I16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_min_i16", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MIN_I16

    Inst_VOP3__V_MIN_I16::~Inst_VOP3__V_MIN_I16()
    {
    } // ~Inst_VOP3__V_MIN_I16

    // --- description from .arch file ---
    // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
    // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]), per active lane.
    // NOTE(review): honors abs/neg input modifiers on integer sources —
    // confirm against the upstream reference; left exactly as written.
    void
    Inst_VOP3__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if (instData.ABS & 0x1) {
            src0.absModifier();
        }

        if (instData.ABS & 0x2) {
            src1.absModifier();
        }

        if (extData.NEG & 0x1) {
            src0.negModifier();
        }

        if (extData.NEG & 0x2) {
            src1.negModifier();
        }

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LDEXP_F16 class methods ---

    Inst_VOP3__V_LDEXP_F16::Inst_VOP3__V_LDEXP_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_ldexp_f16", false)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_LDEXP_F16

    Inst_VOP3__V_LDEXP_F16::~Inst_VOP3__V_LDEXP_F16()
    {
    } // ~Inst_VOP3__V_LDEXP_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * (2 ** S1.i16).
    void
    Inst_VOP3__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        // F16 arithmetic is not modeled; aborts the simulation if decoded.
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_ADD_U32 class methods ---

    Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_add_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_ADD_U32

    Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32()
    {
    } // ~Inst_VOP3__V_ADD_U32

    // --- description from .arch file ---
    // D.u32 = S0.u32 + S1.u32.
+ void + Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUB_U32 class methods --- + + Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sub_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUB_U32 + + Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32() + { + } // ~Inst_VOP3__V_SUB_U32 + + // --- description from .arch file --- + // D.u32 = S0.u32 - S1.u32. 
+ void + Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] - src1[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SUBREV_U32 class methods --- + + Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_subrev_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SUBREV_U32 + + Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32() + { + } // ~Inst_VOP3__V_SUBREV_U32 + + // --- description from .arch file --- + // D.u32 = S1.u32 - S0.u32. 
+ void + Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] - src0[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FMAC_F32 class methods --- + + Inst_VOP3__V_FMAC_F32::Inst_VOP3__V_FMAC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fmac_f32", false) + { + setFlag(ALU); + setFlag(F32); + setFlag(FMA); + } // Inst_VOP3__V_FMAC_F32 + + Inst_VOP3__V_FMAC_F32::~Inst_VOP3__V_FMAC_F32() + { + } // ~Inst_VOP3__V_FMAC_F32 + + // --- description from .arch file --- + // D.f = S0.f * S1.f + D.f. 
+ void + Inst_VOP3__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vdst.read(); + + panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode); + panic_if(isDPPInst(), "DPP not implemented for %s", _opcode); + panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + vdst.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + vdst.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + float out = std::fma(src0[lane], src1[lane], vdst[lane]); + out = omodModifier(out, extData.OMOD); + out = std::clamp(vdst[lane], 0.0f, 1.0f); + vdst[lane] = out; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_NOP class methods --- + + Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_nop", false) + { + setFlag(Nop); + setFlag(ALU); + } // Inst_VOP3__V_NOP + + Inst_VOP3__V_NOP::~Inst_VOP3__V_NOP() + { + } // ~Inst_VOP3__V_NOP + + // --- description from .arch file --- + // Do nothing. + void + Inst_VOP3__V_NOP::execute(GPUDynInstPtr gpuDynInst) + { + } // execute + // --- Inst_VOP3__V_MOV_B32 class methods --- + + Inst_VOP3__V_MOV_B32::Inst_VOP3__V_MOV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mov_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MOV_B32 + + Inst_VOP3__V_MOV_B32::~Inst_VOP3__V_MOV_B32() + { + } // ~Inst_VOP3__V_MOV_B32 + + // --- description from .arch file --- + // D.u = S0.u. + // Input and output modifiers not supported; this is an untyped operation. 
+ void + Inst_VOP3__V_MOV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_I32_F64 class methods --- + + Inst_VOP3__V_CVT_I32_F64::Inst_VOP3__V_CVT_I32_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_i32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_I32_F64 + + Inst_VOP3__V_CVT_I32_F64::~Inst_VOP3__V_CVT_I32_F64() + { + } // ~Inst_VOP3__V_CVT_I32_F64 + + // --- description from .arch file --- + // D.i = (int)S0.d. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. + void + Inst_VOP3__V_CVT_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane]) || exp > 30) { + if (std::signbit(src[lane])) { + vdst[lane] = INT_MIN; + } else { + vdst[lane] = INT_MAX; + } + } else { + vdst[lane] = (VecElemI32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F64_I32 class methods --- + + Inst_VOP3__V_CVT_F64_I32::Inst_VOP3__V_CVT_F64_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f64_i32", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F64_I32 + + Inst_VOP3__V_CVT_F64_I32::~Inst_VOP3__V_CVT_F64_I32() + { + } // ~Inst_VOP3__V_CVT_F64_I32 + + // --- 
description from .arch file --- + // D.d = (double)S0.i. + void + Inst_VOP3__V_CVT_F64_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_I32 class methods --- + + Inst_VOP3__V_CVT_F32_I32::Inst_VOP3__V_CVT_F32_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_i32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_I32 + + Inst_VOP3__V_CVT_F32_I32::~Inst_VOP3__V_CVT_F32_I32() + { + } // ~Inst_VOP3__V_CVT_F32_I32 + + // --- description from .arch file --- + // D.f = (float)S0.i. + void + Inst_VOP3__V_CVT_F32_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + VecOperandI32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_U32 class methods --- + + Inst_VOP3__V_CVT_F32_U32::Inst_VOP3__V_CVT_F32_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_u32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_U32 + + Inst_VOP3__V_CVT_F32_U32::~Inst_VOP3__V_CVT_F32_U32() + { + } // ~Inst_VOP3__V_CVT_F32_U32 + + // --- description from 
.arch file --- + // D.f = (float)S0.u. + void + Inst_VOP3__V_CVT_F32_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_U32_F32 class methods --- + + Inst_VOP3__V_CVT_U32_F32::Inst_VOP3__V_CVT_U32_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_u32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_U32_F32 + + Inst_VOP3__V_CVT_U32_F32::~Inst_VOP3__V_CVT_U32_F32() + { + } // ~Inst_VOP3__V_CVT_U32_F32 + + // --- description from .arch file --- + // D.u = (unsigned)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
+ void + Inst_VOP3__V_CVT_U32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = 0; + } else { + vdst[lane] = UINT_MAX; + } + } else if (exp > 31) { + vdst[lane] = UINT_MAX; + } else { + vdst[lane] = (VecElemU32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_I32_F32 class methods --- + + Inst_VOP3__V_CVT_I32_F32::Inst_VOP3__V_CVT_I32_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_I32_F32 + + Inst_VOP3__V_CVT_I32_F32::~Inst_VOP3__V_CVT_I32_F32() + { + } // ~Inst_VOP3__V_CVT_I32_F32 + + // --- description from .arch file --- + // D.i = (int)S0.f. + // Out-of-range floating point values (including infinity) saturate. NaN is + // --- converted to 0. 
+ void + Inst_VOP3__V_CVT_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane]) || exp > 30) { + if (std::signbit(src[lane])) { + vdst[lane] = INT_MIN; + } else { + vdst[lane] = INT_MAX; + } + } else { + vdst[lane] = (VecElemI32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MOV_FED_B32 class methods --- + + Inst_VOP3__V_MOV_FED_B32::Inst_VOP3__V_MOV_FED_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mov_fed_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MOV_FED_B32 + + Inst_VOP3__V_MOV_FED_B32::~Inst_VOP3__V_MOV_FED_B32() + { + } // ~Inst_VOP3__V_MOV_FED_B32 + + // --- description from .arch file --- + // D.u = S0.u; + // Introduce EDC double error upon write to dest vgpr without causing an + // --- exception. + // Input and output modifiers not supported; this is an untyped operation. 
+ void + Inst_VOP3__V_MOV_FED_B32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F16_F32 class methods --- + + Inst_VOP3__V_CVT_F16_F32::Inst_VOP3__V_CVT_F16_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f16_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F16_F32 + + Inst_VOP3__V_CVT_F16_F32::~Inst_VOP3__V_CVT_F16_F32() + { + } // ~Inst_VOP3__V_CVT_F16_F32 + + // --- description from .arch file --- + // D.f16 = flt32_to_flt16(S0.f). + // Supports input modifiers and creates FP16 denormals when appropriate. + void + Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + vdst.read(); + + panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode); + panic_if(isDPPInst(), "DPP not implemented for %s", _opcode); + + unsigned abs = instData.ABS; + unsigned neg = extData.NEG; + int opsel = instData.OPSEL; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + float tmp = src0[lane]; + + if ((abs & 1) && (tmp < 0)) tmp = -tmp; + if (neg & 1) tmp = -tmp; + + tmp = omodModifier(tmp, extData.OMOD); + tmp = std::clamp(tmp, 0.0f, 1.0f); + + AMDGPU::mxfloat16 out(tmp); + + // If opsel[3] use upper 16-bits of dest, otherwise lower. 
+ if (opsel & 8) { + replaceBits(vdst[lane], 31, 16, (out.data >> 16)); + } else { + replaceBits(vdst[lane], 15, 0, (out.data >> 16)); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_F16 class methods --- + + Inst_VOP3__V_CVT_F32_F16::Inst_VOP3__V_CVT_F32_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_f16", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_F16 + + Inst_VOP3__V_CVT_F32_F16::~Inst_VOP3__V_CVT_F32_F16() + { + } // ~Inst_VOP3__V_CVT_F32_F16 + + // --- description from .arch file --- + // D.f = flt16_to_flt32(S0.f16). + // FP16 denormal inputs are always accepted. + void + Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + + panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode); + panic_if(isDPPInst(), "DPP not implemented for %s", _opcode); + panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode); + + unsigned abs = instData.ABS; + unsigned neg = extData.NEG; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + AMDGPU::mxfloat16 tmp(src0[lane]); + + if ((abs & 1) && (tmp < 0)) tmp = -tmp; + if (neg & 1) tmp = -tmp; + + float out = omodModifier(float(tmp), extData.OMOD); + out = std::clamp(out, 0.0f, 1.0f); + + vdst[lane] = out; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods --- + + Inst_VOP3__V_CVT_RPI_I32_F32::Inst_VOP3__V_CVT_RPI_I32_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_rpi_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_RPI_I32_F32 + + Inst_VOP3__V_CVT_RPI_I32_F32::~Inst_VOP3__V_CVT_RPI_I32_F32() + { + } // ~Inst_VOP3__V_CVT_RPI_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f + 0.5). 
+ void + Inst_VOP3__V_CVT_RPI_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_FLR_I32_F32 class methods --- + + Inst_VOP3__V_CVT_FLR_I32_F32::Inst_VOP3__V_CVT_FLR_I32_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_flr_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_FLR_I32_F32 + + Inst_VOP3__V_CVT_FLR_I32_F32::~Inst_VOP3__V_CVT_FLR_I32_F32() + { + } // ~Inst_VOP3__V_CVT_FLR_I32_F32 + + // --- description from .arch file --- + // D.i = (int)floor(S0.f). + void + Inst_VOP3__V_CVT_FLR_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemI32)std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_OFF_F32_I4 class methods --- + + Inst_VOP3__V_CVT_OFF_F32_I4::Inst_VOP3__V_CVT_OFF_F32_I4(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_off_f32_i4", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_OFF_F32_I4 + + Inst_VOP3__V_CVT_OFF_F32_I4::~Inst_VOP3__V_CVT_OFF_F32_I4() + { + } // ~Inst_VOP3__V_CVT_OFF_F32_I4 + + // --- description from .arch file --- + // 4-bit signed int to 32-bit float. Used for interpolation in shader. 
+ void + Inst_VOP3__V_CVT_OFF_F32_I4::execute(GPUDynInstPtr gpuDynInst) + { + // Could not parse sq_uc.arch desc field + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F32_F64 class methods --- + + Inst_VOP3__V_CVT_F32_F64::Inst_VOP3__V_CVT_F32_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F32_F64 + + Inst_VOP3__V_CVT_F32_F64::~Inst_VOP3__V_CVT_F32_F64() + { + } // ~Inst_VOP3__V_CVT_F32_F64 + + // --- description from .arch file --- + // D.f = (float)S0.d. + void + Inst_VOP3__V_CVT_F32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F64_F32 class methods --- + + Inst_VOP3__V_CVT_F64_F32::Inst_VOP3__V_CVT_F64_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f64_f32", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F64_F32 + + Inst_VOP3__V_CVT_F64_F32::~Inst_VOP3__V_CVT_F64_F32() + { + } // ~Inst_VOP3__V_CVT_F64_F32 + + // --- description from .arch file --- + // D.d = (double)S0.f. 
+ void + Inst_VOP3__V_CVT_F64_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE0 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE0::Inst_VOP3__V_CVT_F32_UBYTE0(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte0", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE0 + + Inst_VOP3__V_CVT_F32_UBYTE0::~Inst_VOP3__V_CVT_F32_UBYTE0() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE0 + + // --- description from .arch file --- + // D.f = (float)(S0.u[7:0]). 
+ void + Inst_VOP3__V_CVT_F32_UBYTE0::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 7, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE1 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE1::Inst_VOP3__V_CVT_F32_UBYTE1(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte1", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE1 + + Inst_VOP3__V_CVT_F32_UBYTE1::~Inst_VOP3__V_CVT_F32_UBYTE1() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE1 + + // --- description from .arch file --- + // D.f = (float)(S0.u[15:8]). + void + Inst_VOP3__V_CVT_F32_UBYTE1::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 15, 8); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE2 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE2::Inst_VOP3__V_CVT_F32_UBYTE2(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte2", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE2 + + Inst_VOP3__V_CVT_F32_UBYTE2::~Inst_VOP3__V_CVT_F32_UBYTE2() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE2 + + // --- description from .arch file --- + // D.f = (float)(S0.u[23:16]). 
+ void + Inst_VOP3__V_CVT_F32_UBYTE2::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 23, 16); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F32_UBYTE3 class methods --- + + Inst_VOP3__V_CVT_F32_UBYTE3::Inst_VOP3__V_CVT_F32_UBYTE3(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte3", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_F32_UBYTE3 + + Inst_VOP3__V_CVT_F32_UBYTE3::~Inst_VOP3__V_CVT_F32_UBYTE3() + { + } // ~Inst_VOP3__V_CVT_F32_UBYTE3 + + // --- description from .arch file --- + // D.f = (float)(S0.u[31:24]). + void + Inst_VOP3__V_CVT_F32_UBYTE3::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF32)bits(src[lane], 31, 24); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_U32_F64 class methods --- + + Inst_VOP3__V_CVT_U32_F64::Inst_VOP3__V_CVT_U32_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_u32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_U32_F64 + + Inst_VOP3__V_CVT_U32_F64::~Inst_VOP3__V_CVT_U32_F64() + { + } // ~Inst_VOP3__V_CVT_U32_F64 + + // --- description from .arch file --- + // D.u = (unsigned)S0.d. + // Out-of-range floating point values (including infinity) saturate. 
NaN is + // --- converted to 0. + void + Inst_VOP3__V_CVT_U32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp; + std::frexp(src[lane],&exp); + if (std::isnan(src[lane])) { + vdst[lane] = 0; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = 0; + } else { + vdst[lane] = UINT_MAX; + } + } else if (exp > 31) { + vdst[lane] = UINT_MAX; + } else { + vdst[lane] = (VecElemU32)src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_F64_U32 class methods --- + + Inst_VOP3__V_CVT_F64_U32::Inst_VOP3__V_CVT_F64_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f64_u32", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CVT_F64_U32 + + Inst_VOP3__V_CVT_F64_U32::~Inst_VOP3__V_CVT_F64_U32() + { + } // ~Inst_VOP3__V_CVT_F64_U32 + + // --- description from .arch file --- + // D.d = (double)S0.u. 
+ void + Inst_VOP3__V_CVT_F64_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (VecElemF64)src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_TRUNC_F64 class methods --- + + Inst_VOP3__V_TRUNC_F64::Inst_VOP3__V_TRUNC_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_trunc_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_TRUNC_F64 + + Inst_VOP3__V_TRUNC_F64::~Inst_VOP3__V_TRUNC_F64() + { + } // ~Inst_VOP3__V_TRUNC_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d), return integer part of S0.d. + void + Inst_VOP3__V_TRUNC_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CEIL_F64 class methods --- + + Inst_VOP3__V_CEIL_F64::Inst_VOP3__V_CEIL_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ceil_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CEIL_F64 + + Inst_VOP3__V_CEIL_F64::~Inst_VOP3__V_CEIL_F64() + { + } // ~Inst_VOP3__V_CEIL_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0. 
+ void + Inst_VOP3__V_CEIL_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RNDNE_F64 class methods --- + + Inst_VOP3__V_RNDNE_F64::Inst_VOP3__V_RNDNE_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rndne_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_RNDNE_F64 + + Inst_VOP3__V_RNDNE_F64::~Inst_VOP3__V_RNDNE_F64() + { + } // ~Inst_VOP3__V_RNDNE_F64 + + // --- description from .arch file --- + // D.d = round_nearest_even(S0.d). + void + Inst_VOP3__V_RNDNE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FLOOR_F64 class methods --- + + Inst_VOP3__V_FLOOR_F64::Inst_VOP3__V_FLOOR_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_floor_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FLOOR_F64 + + Inst_VOP3__V_FLOOR_F64::~Inst_VOP3__V_FLOOR_F64() + { + } // ~Inst_VOP3__V_FLOOR_F64 + + // --- description from .arch file --- + // D.d = trunc(S0.d); + // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0. 
+ void + Inst_VOP3__V_FLOOR_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FRACT_F32 class methods --- + + Inst_VOP3__V_FRACT_F32::Inst_VOP3__V_FRACT_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fract_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FRACT_F32 + + Inst_VOP3__V_FRACT_F32::~Inst_VOP3__V_FRACT_F32() + { + } // ~Inst_VOP3__V_FRACT_F32 + + // --- description from .arch file --- + // D.f = S0.f - floor(S0.f). + void + Inst_VOP3__V_FRACT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_TRUNC_F32 class methods --- + + Inst_VOP3__V_TRUNC_F32::Inst_VOP3__V_TRUNC_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_trunc_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_TRUNC_F32 + + Inst_VOP3__V_TRUNC_F32::~Inst_VOP3__V_TRUNC_F32() + { + } // ~Inst_VOP3__V_TRUNC_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f), return integer part of S0.f. 
+ void + Inst_VOP3__V_TRUNC_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::trunc(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CEIL_F32 class methods --- + + Inst_VOP3__V_CEIL_F32::Inst_VOP3__V_CEIL_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ceil_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CEIL_F32 + + Inst_VOP3__V_CEIL_F32::~Inst_VOP3__V_CEIL_F32() + { + } // ~Inst_VOP3__V_CEIL_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0. + void + Inst_VOP3__V_CEIL_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ceil(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RNDNE_F32 class methods --- + + Inst_VOP3__V_RNDNE_F32::Inst_VOP3__V_RNDNE_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rndne_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_RNDNE_F32 + + Inst_VOP3__V_RNDNE_F32::~Inst_VOP3__V_RNDNE_F32() + { + } // ~Inst_VOP3__V_RNDNE_F32 + + // --- description from .arch file --- + // D.f = round_nearest_even(S0.f). 
+ void + Inst_VOP3__V_RNDNE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = roundNearestEven(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FLOOR_F32 class methods --- + + Inst_VOP3__V_FLOOR_F32::Inst_VOP3__V_FLOOR_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_floor_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FLOOR_F32 + + Inst_VOP3__V_FLOOR_F32::~Inst_VOP3__V_FLOOR_F32() + { + } // ~Inst_VOP3__V_FLOOR_F32 + + // --- description from .arch file --- + // D.f = trunc(S0.f); + // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0. + void + Inst_VOP3__V_FLOOR_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::floor(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_EXP_F32 class methods --- + + Inst_VOP3__V_EXP_F32::Inst_VOP3__V_EXP_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_exp_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_EXP_F32 + + Inst_VOP3__V_EXP_F32::~Inst_VOP3__V_EXP_F32() + { + } // ~Inst_VOP3__V_EXP_F32 + + // --- description from .arch file --- + // D.f = pow(2.0, S0.f). 
+ void + Inst_VOP3__V_EXP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::pow(2.0, src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LOG_F32 class methods --- + + Inst_VOP3__V_LOG_F32::Inst_VOP3__V_LOG_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_log_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_LOG_F32 + + Inst_VOP3__V_LOG_F32::~Inst_VOP3__V_LOG_F32() + { + } // ~Inst_VOP3__V_LOG_F32 + + // --- description from .arch file --- + // D.f = log2(S0.f). Base 2 logarithm. + void + Inst_VOP3__V_LOG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::log2(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RCP_F32 class methods --- + + Inst_VOP3__V_RCP_F32::Inst_VOP3__V_RCP_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rcp_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_RCP_F32 + + Inst_VOP3__V_RCP_F32::~Inst_VOP3__V_RCP_F32() + { + } // ~Inst_VOP3__V_RCP_F32 + + // --- description from .arch file --- + // D.f = 1.0 / S0.f. 
Reciprocal with IEEE rules and < 1ulp error. + void + Inst_VOP3__V_RCP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RCP_IFLAG_F32 class methods --- + + Inst_VOP3__V_RCP_IFLAG_F32::Inst_VOP3__V_RCP_IFLAG_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rcp_iflag_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_RCP_IFLAG_F32 + + Inst_VOP3__V_RCP_IFLAG_F32::~Inst_VOP3__V_RCP_IFLAG_F32() + { + } // ~Inst_VOP3__V_RCP_IFLAG_F32 + + // --- description from .arch file --- + // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise + // --- integer DIV_BY_ZERO exception but cannot raise floating-point + // --- exceptions. + void + Inst_VOP3__V_RCP_IFLAG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RSQ_F32 class methods --- + + Inst_VOP3__V_RSQ_F32::Inst_VOP3__V_RSQ_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rsq_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_RSQ_F32 + + Inst_VOP3__V_RSQ_F32::~Inst_VOP3__V_RSQ_F32() + { + } // ~Inst_VOP3__V_RSQ_F32 + + // --- description from .arch file --- + // D.f = 1.0 / sqrt(S0.f). 
Reciprocal square root with IEEE rules. + void + Inst_VOP3__V_RSQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = 1.0 / std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RCP_F64 class methods --- + + Inst_VOP3__V_RCP_F64::Inst_VOP3__V_RCP_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rcp_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_RCP_F64 + + Inst_VOP3__V_RCP_F64::~Inst_VOP3__V_RCP_F64() + { + } // ~Inst_VOP3__V_RCP_F64 + + // --- description from .arch file --- + // D.d = 1.0 / S0.d. + void + Inst_VOP3__V_RCP_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::fpclassify(src[lane]) == FP_ZERO) { + vdst[lane] = +INFINITY; + } else if (std::isnan(src[lane])) { + vdst[lane] = NAN; + } else if (std::isinf(src[lane])) { + if (std::signbit(src[lane])) { + vdst[lane] = -0.0; + } else { + vdst[lane] = 0.0; + } + } else { + vdst[lane] = 1.0 / src[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_RSQ_F64 class methods --- + + Inst_VOP3__V_RSQ_F64::Inst_VOP3__V_RSQ_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rsq_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_RSQ_F64 + + Inst_VOP3__V_RSQ_F64::~Inst_VOP3__V_RSQ_F64() + { + } // ~Inst_VOP3__V_RSQ_F64 
+ + // --- description from .arch file --- + // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32. + void + Inst_VOP3__V_RSQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::fpclassify(src[lane]) == FP_ZERO) { + vdst[lane] = +INFINITY; + } else if (std::isnan(src[lane])) { + vdst[lane] = NAN; + } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) { + vdst[lane] = 0.0; + } else if (std::signbit(src[lane])) { + vdst[lane] = NAN; + } else { + vdst[lane] = 1.0 / std::sqrt(src[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SQRT_F32 class methods --- + + Inst_VOP3__V_SQRT_F32::Inst_VOP3__V_SQRT_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sqrt_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SQRT_F32 + + Inst_VOP3__V_SQRT_F32::~Inst_VOP3__V_SQRT_F32() + { + } // ~Inst_VOP3__V_SQRT_F32 + + // --- description from .arch file --- + // D.f = sqrt(S0.f). 
+ void + Inst_VOP3__V_SQRT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SQRT_F64 class methods --- + + Inst_VOP3__V_SQRT_F64::Inst_VOP3__V_SQRT_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sqrt_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_SQRT_F64 + + Inst_VOP3__V_SQRT_F64::~Inst_VOP3__V_SQRT_F64() + { + } // ~Inst_VOP3__V_SQRT_F64 + + // --- description from .arch file --- + // D.d = sqrt(S0.d). + void + Inst_VOP3__V_SQRT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sqrt(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SIN_F32 class methods --- + + Inst_VOP3__V_SIN_F32::Inst_VOP3__V_SIN_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sin_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_SIN_F32 + + Inst_VOP3__V_SIN_F32::~Inst_VOP3__V_SIN_F32() + { + } // ~Inst_VOP3__V_SIN_F32 + + // --- description from .arch file --- + // D.f = sin(S0.f * 2 * PI). + // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in + // float 0.0. 
+ void + Inst_VOP3__V_SIN_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + ConstScalarOperandF32 pi(gpuDynInst, REG_PI); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + pi.read(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::sin(src[lane] * 2 * pi.rawData()); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_COS_F32 class methods --- + + Inst_VOP3__V_COS_F32::Inst_VOP3__V_COS_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cos_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_COS_F32 + + Inst_VOP3__V_COS_F32::~Inst_VOP3__V_COS_F32() + { + } // ~Inst_VOP3__V_COS_F32 + + // --- description from .arch file --- + // D.f = cos(S0.f * 2 * PI). + // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in + // float 1.0. + void + Inst_VOP3__V_COS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + ConstScalarOperandF32 pi(gpuDynInst, REG_PI); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + pi.read(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::cos(src[lane] * 2 * pi.rawData()); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_NOT_B32 class methods --- + + Inst_VOP3__V_NOT_B32::Inst_VOP3__V_NOT_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_not_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_NOT_B32 + + Inst_VOP3__V_NOT_B32::~Inst_VOP3__V_NOT_B32() + { + } // ~Inst_VOP3__V_NOT_B32 + + // --- description from .arch file --- + // D.u = ~S0.u. 
+ // Input and output modifiers not supported. + void + Inst_VOP3__V_NOT_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = ~src[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_BFREV_B32 class methods --- + + Inst_VOP3__V_BFREV_B32::Inst_VOP3__V_BFREV_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_bfrev_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_BFREV_B32 + + Inst_VOP3__V_BFREV_B32::~Inst_VOP3__V_BFREV_B32() + { + } // ~Inst_VOP3__V_BFREV_B32 + + // --- description from .arch file --- + // D.u[31:0] = S0.u[0:31], bitfield reverse. + // Input and output modifiers not supported. + void + Inst_VOP3__V_BFREV_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = reverseBits(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FFBH_U32 class methods --- + + Inst_VOP3__V_FFBH_U32::Inst_VOP3__V_FFBH_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ffbh_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_FFBH_U32 + + Inst_VOP3__V_FFBH_U32::~Inst_VOP3__V_FFBH_U32() + { + } // ~Inst_VOP3__V_FFBH_U32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from MSB; + // D.u = 0xffffffff if S0.u == 0. 
+ void + Inst_VOP3__V_FFBH_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOneMsb(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FFBL_B32 class methods --- + + Inst_VOP3__V_FFBL_B32::Inst_VOP3__V_FFBL_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ffbl_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_FFBL_B32 + + Inst_VOP3__V_FFBL_B32::~Inst_VOP3__V_FFBL_B32() + { + } // ~Inst_VOP3__V_FFBL_B32 + + // --- description from .arch file --- + // D.u = position of first 1 in S0.u from LSB; + // D.u = 0xffffffff if S0.u == 0. + void + Inst_VOP3__V_FFBL_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = findFirstOne(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FFBH_I32 class methods --- + + Inst_VOP3__V_FFBH_I32::Inst_VOP3__V_FFBH_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ffbh_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_FFBH_I32 + + Inst_VOP3__V_FFBH_I32::~Inst_VOP3__V_FFBH_I32() + { + } // ~Inst_VOP3__V_FFBH_I32 + + // --- description from .arch file --- + // D.u = position of first bit different from sign bit in S0.i from MSB; + // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff. 
+ void + Inst_VOP3__V_FFBH_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = firstOppositeSignBit(src[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_EXP_I32_F64 class methods --- + + Inst_VOP3__V_FREXP_EXP_I32_F64::Inst_VOP3__V_FREXP_EXP_I32_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FREXP_EXP_I32_F64 + + Inst_VOP3__V_FREXP_EXP_I32_F64::~Inst_VOP3__V_FREXP_EXP_I32_F64() + { + } // ~Inst_VOP3__V_FREXP_EXP_I32_F64 + + // --- description from .arch file --- + // See V_FREXP_EXP_I32_F32. + void + Inst_VOP3__V_FREXP_EXP_I32_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp(0); + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_MANT_F64 class methods --- + + Inst_VOP3__V_FREXP_MANT_F64::Inst_VOP3__V_FREXP_MANT_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_mant_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FREXP_MANT_F64 + + Inst_VOP3__V_FREXP_MANT_F64::~Inst_VOP3__V_FREXP_MANT_F64() + { + } // ~Inst_VOP3__V_FREXP_MANT_F64 + + // --- 
description from .arch file --- + // See V_FREXP_MANT_F32. + void + Inst_VOP3__V_FREXP_MANT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FRACT_F64 class methods --- + + Inst_VOP3__V_FRACT_F64::Inst_VOP3__V_FRACT_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fract_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_FRACT_F64 + + Inst_VOP3__V_FRACT_F64::~Inst_VOP3__V_FRACT_F64() + { + } // ~Inst_VOP3__V_FRACT_F64 + + // --- description from .arch file --- + // See V_FRACT_F32. + void + Inst_VOP3__V_FRACT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src(gpuDynInst, extData.SRC0); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemF32 int_part(0.0); + vdst[lane] = std::modf(src[lane], &int_part); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_EXP_I32_F32 class methods --- + + Inst_VOP3__V_FREXP_EXP_I32_F32::Inst_VOP3__V_FREXP_EXP_I32_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FREXP_EXP_I32_F32 + + Inst_VOP3__V_FREXP_EXP_I32_F32::~Inst_VOP3__V_FREXP_EXP_I32_F32() + { + } // ~Inst_VOP3__V_FREXP_EXP_I32_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) 
then D.i = 0; + // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1). + // Returns exponent of single precision float input, such that S0.f = + // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns + // the significand. + void + Inst_VOP3__V_FREXP_EXP_I32_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane])|| std::isnan(src[lane])) { + vdst[lane] = 0; + } else { + VecElemI32 exp(0); + std::frexp(src[lane], &exp); + vdst[lane] = exp; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FREXP_MANT_F32 class methods --- + + Inst_VOP3__V_FREXP_MANT_F32::Inst_VOP3__V_FREXP_MANT_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_frexp_mant_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_FREXP_MANT_F32 + + Inst_VOP3__V_FREXP_MANT_F32::~Inst_VOP3__V_FREXP_MANT_F32() + { + } // ~Inst_VOP3__V_FREXP_MANT_F32 + + // --- description from .arch file --- + // if (S0.f == INF || S0.f == NAN) then D.f = S0.f; + // else D.f = Mantissa(S0.f). + // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary + // --- significand of single precision float input, such that S0.f = + // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which + // --- returns integer exponent. 
+ void + Inst_VOP3__V_FREXP_MANT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src(gpuDynInst, extData.SRC0); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + if (instData.ABS & 0x1) { + src.absModifier(); + } + + if (extData.NEG & 0x1) { + src.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isinf(src[lane]) || std::isnan(src[lane])) { + vdst[lane] = src[lane]; + } else { + VecElemI32 exp(0); + vdst[lane] = std::frexp(src[lane], &exp); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CLREXCP class methods --- + + Inst_VOP3__V_CLREXCP::Inst_VOP3__V_CLREXCP(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_clrexcp", false) + { + } // Inst_VOP3__V_CLREXCP + + Inst_VOP3__V_CLREXCP::~Inst_VOP3__V_CLREXCP() + { + } // ~Inst_VOP3__V_CLREXCP + + // --- description from .arch file --- + // Clear wave's exception state in SIMD (SP). + void + Inst_VOP3__V_CLREXCP::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F16_U16 class methods --- + + Inst_VOP3__V_CVT_F16_U16::Inst_VOP3__V_CVT_F16_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f16_u16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_F16_U16 + + Inst_VOP3__V_CVT_F16_U16::~Inst_VOP3__V_CVT_F16_U16() + { + } // ~Inst_VOP3__V_CVT_F16_U16 + + // --- description from .arch file --- + // D.f16 = uint16_to_flt16(S.u16). + // Supports denormals, rounding, exception flags and saturation. 
+ void + Inst_VOP3__V_CVT_F16_U16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_F16_I16 class methods --- + + Inst_VOP3__V_CVT_F16_I16::Inst_VOP3__V_CVT_F16_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_f16_i16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_F16_I16 + + Inst_VOP3__V_CVT_F16_I16::~Inst_VOP3__V_CVT_F16_I16() + { + } // ~Inst_VOP3__V_CVT_F16_I16 + + // --- description from .arch file --- + // D.f16 = int16_to_flt16(S.i16). + // Supports denormals, rounding, exception flags and saturation. + void + Inst_VOP3__V_CVT_F16_I16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_U16_F16 class methods --- + + Inst_VOP3__V_CVT_U16_F16::Inst_VOP3__V_CVT_U16_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_u16_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_U16_F16 + + Inst_VOP3__V_CVT_U16_F16::~Inst_VOP3__V_CVT_U16_F16() + { + } // ~Inst_VOP3__V_CVT_U16_F16 + + // --- description from .arch file --- + // D.u16 = flt16_to_uint16(S.f16). + // Supports rounding, exception flags and saturation. + void + Inst_VOP3__V_CVT_U16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_I16_F16 class methods --- + + Inst_VOP3__V_CVT_I16_F16::Inst_VOP3__V_CVT_I16_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_i16_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CVT_I16_F16 + + Inst_VOP3__V_CVT_I16_F16::~Inst_VOP3__V_CVT_I16_F16() + { + } // ~Inst_VOP3__V_CVT_I16_F16 + + // --- description from .arch file --- + // D.i16 = flt16_to_int16(S.f16). + // Supports rounding, exception flags and saturation. 
+ void + Inst_VOP3__V_CVT_I16_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_RCP_F16 class methods --- + + Inst_VOP3__V_RCP_F16::Inst_VOP3__V_RCP_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rcp_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_RCP_F16 + + Inst_VOP3__V_RCP_F16::~Inst_VOP3__V_RCP_F16() + { + } // ~Inst_VOP3__V_RCP_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecip(S0.f16). + void + Inst_VOP3__V_RCP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_SQRT_F16 class methods --- + + Inst_VOP3__V_SQRT_F16::Inst_VOP3__V_SQRT_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sqrt_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_SQRT_F16 + + Inst_VOP3__V_SQRT_F16::~Inst_VOP3__V_SQRT_F16() + { + } // ~Inst_VOP3__V_SQRT_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateSqrt(S0.f16). + void + Inst_VOP3__V_SQRT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_RSQ_F16 class methods --- + + Inst_VOP3__V_RSQ_F16::Inst_VOP3__V_RSQ_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_rsq_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_RSQ_F16 + + Inst_VOP3__V_RSQ_F16::~Inst_VOP3__V_RSQ_F16() + { + } // ~Inst_VOP3__V_RSQ_F16 + + // --- description from .arch file --- + // if (S0.f16 == 1.0f) + // D.f16 = 1.0f; + // else + // D.f16 = ApproximateRecipSqrt(S0.f16). 
+ // NOTE(review): every F16 VOP3 op in this region is a stub whose
+ // execute() calls panicUnimplemented(), i.e. decoding one of these
+ // aborts the simulation. Half-precision VOP3 ALU support is TODO.
+ void
+ Inst_VOP3__V_RSQ_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_LOG_F16 class methods ---
+
+ Inst_VOP3__V_LOG_F16::Inst_VOP3__V_LOG_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_log_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_LOG_F16
+
+ Inst_VOP3__V_LOG_F16::~Inst_VOP3__V_LOG_F16()
+ {
+ } // ~Inst_VOP3__V_LOG_F16
+
+ // --- description from .arch file ---
+ // if (S0.f16 == 1.0f)
+ // D.f16 = 0.0f;
+ // else
+ // D.f16 = ApproximateLog2(S0.f16).
+ void
+ Inst_VOP3__V_LOG_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_EXP_F16 class methods ---
+
+ Inst_VOP3__V_EXP_F16::Inst_VOP3__V_EXP_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_exp_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_EXP_F16
+
+ Inst_VOP3__V_EXP_F16::~Inst_VOP3__V_EXP_F16()
+ {
+ } // ~Inst_VOP3__V_EXP_F16
+
+ // --- description from .arch file ---
+ // if (S0.f16 == 0.0f)
+ // D.f16 = 1.0f;
+ // else
+ // D.f16 = Approximate2ToX(S0.f16).
+ void
+ Inst_VOP3__V_EXP_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_FREXP_MANT_F16 class methods ---
+
+ Inst_VOP3__V_FREXP_MANT_F16::Inst_VOP3__V_FREXP_MANT_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_frexp_mant_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_FREXP_MANT_F16
+
+ Inst_VOP3__V_FREXP_MANT_F16::~Inst_VOP3__V_FREXP_MANT_F16()
+ {
+ } // ~Inst_VOP3__V_FREXP_MANT_F16
+
+ // --- description from .arch file ---
+ // if (S0.f16 == +-INF || S0.f16 == NAN)
+ // D.f16 = S0.f16;
+ // else
+ // D.f16 = mantissa(S0.f16).
+ // Result range is (-1.0,-0.5][0.5,1.0).
+ // C math library frexp function.
+ // Returns binary significand of half precision float input, such that the
+ // original single float = significand * (2 ** exponent).
+ void
+ Inst_VOP3__V_FREXP_MANT_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_FREXP_EXP_I16_F16 class methods ---
+
+ Inst_VOP3__V_FREXP_EXP_I16_F16::Inst_VOP3__V_FREXP_EXP_I16_F16(
+ InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_frexp_exp_i16_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_FREXP_EXP_I16_F16
+
+ Inst_VOP3__V_FREXP_EXP_I16_F16::~Inst_VOP3__V_FREXP_EXP_I16_F16()
+ {
+ } // ~Inst_VOP3__V_FREXP_EXP_I16_F16
+
+ // --- description from .arch file ---
+ // if (S0.f16 == +-INF || S0.f16 == NAN)
+ // D.i16 = 0;
+ // else
+ // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1).
+ // C math library frexp function.
+ // Returns exponent of half precision float input, such that the
+ // original single float = significand * (2 ** exponent).
+ void
+ Inst_VOP3__V_FREXP_EXP_I16_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_FLOOR_F16 class methods ---
+
+ Inst_VOP3__V_FLOOR_F16::Inst_VOP3__V_FLOOR_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_floor_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_FLOOR_F16
+
+ Inst_VOP3__V_FLOOR_F16::~Inst_VOP3__V_FLOOR_F16()
+ {
+ } // ~Inst_VOP3__V_FLOOR_F16
+
+ // --- description from .arch file ---
+ // D.f16 = trunc(S0.f16);
+ // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f.
+ void
+ Inst_VOP3__V_FLOOR_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_CEIL_F16 class methods ---
+
+ Inst_VOP3__V_CEIL_F16::Inst_VOP3__V_CEIL_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_ceil_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_CEIL_F16
+
+ Inst_VOP3__V_CEIL_F16::~Inst_VOP3__V_CEIL_F16()
+ {
+ } // ~Inst_VOP3__V_CEIL_F16
+
+ // --- description from .arch file ---
+ // D.f16 = trunc(S0.f16);
+ // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f.
+ void
+ Inst_VOP3__V_CEIL_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_TRUNC_F16 class methods ---
+
+ Inst_VOP3__V_TRUNC_F16::Inst_VOP3__V_TRUNC_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_trunc_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_TRUNC_F16
+
+ Inst_VOP3__V_TRUNC_F16::~Inst_VOP3__V_TRUNC_F16()
+ {
+ } // ~Inst_VOP3__V_TRUNC_F16
+
+ // --- description from .arch file ---
+ // D.f16 = trunc(S0.f16).
+ // Round-to-zero semantics.
+ void
+ Inst_VOP3__V_TRUNC_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_RNDNE_F16 class methods ---
+
+ Inst_VOP3__V_RNDNE_F16::Inst_VOP3__V_RNDNE_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_rndne_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_RNDNE_F16
+
+ Inst_VOP3__V_RNDNE_F16::~Inst_VOP3__V_RNDNE_F16()
+ {
+ } // ~Inst_VOP3__V_RNDNE_F16
+
+ // --- description from .arch file ---
+ // D.f16 = FLOOR(S0.f16 + 0.5f);
+ // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f.
+ // Round-to-nearest-even semantics.
+ void
+ Inst_VOP3__V_RNDNE_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_FRACT_F16 class methods ---
+
+ Inst_VOP3__V_FRACT_F16::Inst_VOP3__V_FRACT_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_fract_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_FRACT_F16
+
+ Inst_VOP3__V_FRACT_F16::~Inst_VOP3__V_FRACT_F16()
+ {
+ } // ~Inst_VOP3__V_FRACT_F16
+
+ // --- description from .arch file ---
+ // D.f16 = S0.f16 + -floor(S0.f16).
+ void
+ Inst_VOP3__V_FRACT_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_SIN_F16 class methods ---
+
+ Inst_VOP3__V_SIN_F16::Inst_VOP3__V_SIN_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_sin_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_SIN_F16
+
+ Inst_VOP3__V_SIN_F16::~Inst_VOP3__V_SIN_F16()
+ {
+ } // ~Inst_VOP3__V_SIN_F16
+
+ // --- description from .arch file ---
+ // D.f16 = sin(S0.f16 * 2 * PI).
+ void
+ Inst_VOP3__V_SIN_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_COS_F16 class methods ---
+
+ Inst_VOP3__V_COS_F16::Inst_VOP3__V_COS_F16(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_cos_f16", false)
+ {
+ setFlag(ALU);
+ setFlag(F16);
+ } // Inst_VOP3__V_COS_F16
+
+ Inst_VOP3__V_COS_F16::~Inst_VOP3__V_COS_F16()
+ {
+ } // ~Inst_VOP3__V_COS_F16
+
+ // --- description from .arch file ---
+ // D.f16 = cos(S0.f16 * 2 * PI).
+ void
+ Inst_VOP3__V_COS_F16::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_EXP_LEGACY_F32 class methods ---
+
+ Inst_VOP3__V_EXP_LEGACY_F32::Inst_VOP3__V_EXP_LEGACY_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_exp_legacy_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ } // Inst_VOP3__V_EXP_LEGACY_F32
+
+ Inst_VOP3__V_EXP_LEGACY_F32::~Inst_VOP3__V_EXP_LEGACY_F32()
+ {
+ } // ~Inst_VOP3__V_EXP_LEGACY_F32
+
+ // --- description from .arch file ---
+ // D.f = pow(2.0, S0.f) with legacy semantics.
+ void
+ Inst_VOP3__V_EXP_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
+ VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+ src.readSrc();
+
+ // Unary VOP3 op: only the src0 ABS/NEG modifier bits are honored.
+ if (instData.ABS & 0x1) {
+ src.absModifier();
+ }
+
+ if (extData.NEG & 0x1) {
+ src.negModifier();
+ }
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // std::pow(2.0, x) evaluates in double precision; the
+ // result is narrowed back to f32 by the vdst assignment.
+ vdst[lane] = std::pow(2.0, src[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_LOG_LEGACY_F32 class methods ---
+
+ Inst_VOP3__V_LOG_LEGACY_F32::Inst_VOP3__V_LOG_LEGACY_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_log_legacy_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ } // Inst_VOP3__V_LOG_LEGACY_F32
+
+ Inst_VOP3__V_LOG_LEGACY_F32::~Inst_VOP3__V_LOG_LEGACY_F32()
+ {
+ } // ~Inst_VOP3__V_LOG_LEGACY_F32
+
+ // --- description from .arch file ---
+ // D.f = log2(S0.f). Base 2 logarithm with legacy semantics.
+ void
+ Inst_VOP3__V_LOG_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
+ VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+ src.readSrc();
+
+ // Apply the src0 input modifiers, mirroring
+ // Inst_VOP3__V_EXP_LEGACY_F32::execute(); previously this opcode
+ // silently ignored ABS/NEG.
+ if (instData.ABS & 0x1) {
+ src.absModifier();
+ }
+
+ if (extData.NEG & 0x1) {
+ src.negModifier();
+ }
+
+ /**
+ * input modifiers are supported by FP operations only; this is a
+ * unary op, so the src1/src2 modifier bits must be clear.
+ */
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = std::log2(src[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MAD_LEGACY_F32 class methods ---
+
+ Inst_VOP3__V_MAD_LEGACY_F32::Inst_VOP3__V_MAD_LEGACY_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_mad_legacy_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ setFlag(MAD);
+ } // Inst_VOP3__V_MAD_LEGACY_F32
+
+ Inst_VOP3__V_MAD_LEGACY_F32::~Inst_VOP3__V_MAD_LEGACY_F32()
+ {
+ } // ~Inst_VOP3__V_MAD_LEGACY_F32
+
+ // --- description from .arch file ---
+ // D.f = S0.f * S1.f + S2.f (DX9 rules, 0.0 * x = 0.0).
+ void
+ Inst_VOP3__V_MAD_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+ VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ if (instData.ABS & 0x1) {
+ src0.absModifier();
+ }
+
+ if (instData.ABS & 0x2) {
+ src1.absModifier();
+ }
+
+ if (instData.ABS & 0x4) {
+ src2.absModifier();
+ }
+
+ if (extData.NEG & 0x1) {
+ src0.negModifier();
+ }
+
+ if (extData.NEG & 0x2) {
+ src1.negModifier();
+ }
+
+ if (extData.NEG & 0x4) {
+ src2.negModifier();
+ }
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // NOTE(review): the .arch description asks for DX9 legacy
+ // multiply semantics (0.0 * x = 0.0 even for x = inf/NaN),
+ // but std::fma follows IEEE-754 and yields NaN for
+ // 0 * inf. Deliberate simplification? TODO confirm before
+ // relying on legacy-shader corner cases.
+ vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MAD_F32 class methods ---
+
+ Inst_VOP3__V_MAD_F32::Inst_VOP3__V_MAD_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_mad_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ setFlag(MAD);
+ }
// Inst_VOP3__V_MAD_F32
+
+ Inst_VOP3__V_MAD_F32::~Inst_VOP3__V_MAD_F32()
+ {
+ } // ~Inst_VOP3__V_MAD_F32
+
+ // --- description from .arch file ---
+ // D.f = S0.f * S1.f + S2.f.
+ void
+ Inst_VOP3__V_MAD_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+ VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ // FP op: honor the per-source ABS/NEG input modifier bits.
+ if (instData.ABS & 0x1) {
+ src0.absModifier();
+ }
+
+ if (instData.ABS & 0x2) {
+ src1.absModifier();
+ }
+
+ if (instData.ABS & 0x4) {
+ src2.absModifier();
+ }
+
+ if (extData.NEG & 0x1) {
+ src0.negModifier();
+ }
+
+ if (extData.NEG & 0x2) {
+ src1.negModifier();
+ }
+
+ if (extData.NEG & 0x4) {
+ src2.negModifier();
+ }
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MAD_I32_I24 class methods ---
+
+ Inst_VOP3__V_MAD_I32_I24::Inst_VOP3__V_MAD_I32_I24(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_mad_i32_i24", false)
+ {
+ setFlag(ALU);
+ setFlag(MAD);
+ } // Inst_VOP3__V_MAD_I32_I24
+
+ Inst_VOP3__V_MAD_I32_I24::~Inst_VOP3__V_MAD_I32_I24()
+ {
+ } // ~Inst_VOP3__V_MAD_I32_I24
+
+ // --- description from .arch file ---
+ // D.i = S0.i[23:0] * S1.i[23:0] + S2.i.
+ void
+ Inst_VOP3__V_MAD_I32_I24::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
+ VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // sext<24> sign-extends the low 24 bits of each
+ // multiplicand before the multiply; the 32-bit
+ // destination keeps the low bits of the result.
+ vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
+ * sext<24>(bits(src1[lane], 23, 0)) + src2[lane];
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MAD_U32_U24 class methods ---
+
+ Inst_VOP3__V_MAD_U32_U24::Inst_VOP3__V_MAD_U32_U24(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_mad_u32_u24", false)
+ {
+ setFlag(ALU);
+ setFlag(MAD);
+ } // Inst_VOP3__V_MAD_U32_U24
+
+ Inst_VOP3__V_MAD_U32_U24::~Inst_VOP3__V_MAD_U32_U24()
+ {
+ } // ~Inst_VOP3__V_MAD_U32_U24
+
+ // --- description from .arch file ---
+ // D.u = S0.u[23:0] * S1.u[23:0] + S2.u.
+ void
+ Inst_VOP3__V_MAD_U32_U24::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0)
+ + src2[lane];
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_CUBEID_F32 class methods ---
+
+ // NOTE(review): the four V_CUBE* graphics helpers below are stubs
+ // that panic at runtime if decoded.
+ Inst_VOP3__V_CUBEID_F32::Inst_VOP3__V_CUBEID_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_cubeid_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ } // Inst_VOP3__V_CUBEID_F32
+
+ Inst_VOP3__V_CUBEID_F32::~Inst_VOP3__V_CUBEID_F32()
+ {
+ } // ~Inst_VOP3__V_CUBEID_F32
+
+ // --- description from .arch file ---
+ // D.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). XYZ coordinate is given in
+ // --- (S0.f, S1.f, S2.f).
+ void
+ Inst_VOP3__V_CUBEID_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_CUBESC_F32 class methods ---
+
+ Inst_VOP3__V_CUBESC_F32::Inst_VOP3__V_CUBESC_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_cubesc_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ } // Inst_VOP3__V_CUBESC_F32
+
+ Inst_VOP3__V_CUBESC_F32::~Inst_VOP3__V_CUBESC_F32()
+ {
+ } // ~Inst_VOP3__V_CUBESC_F32
+
+ // --- description from .arch file ---
+ // D.f = cubemap S coordinate. XYZ coordinate is given in (S0.f, S1.f,
+ // S2.f).
+ void
+ Inst_VOP3__V_CUBESC_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_CUBETC_F32 class methods ---
+
+ Inst_VOP3__V_CUBETC_F32::Inst_VOP3__V_CUBETC_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_cubetc_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ } // Inst_VOP3__V_CUBETC_F32
+
+ Inst_VOP3__V_CUBETC_F32::~Inst_VOP3__V_CUBETC_F32()
+ {
+ } // ~Inst_VOP3__V_CUBETC_F32
+
+ // --- description from .arch file ---
+ // D.f = cubemap T coordinate. XYZ coordinate is given in (S0.f, S1.f,
+ // S2.f).
+ void
+ Inst_VOP3__V_CUBETC_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_CUBEMA_F32 class methods ---
+
+ Inst_VOP3__V_CUBEMA_F32::Inst_VOP3__V_CUBEMA_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_cubema_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ } // Inst_VOP3__V_CUBEMA_F32
+
+ Inst_VOP3__V_CUBEMA_F32::~Inst_VOP3__V_CUBEMA_F32()
+ {
+ } // ~Inst_VOP3__V_CUBEMA_F32
+
+ // --- description from .arch file ---
+ // D.f = 2.0 * cubemap major axis. XYZ coordinate is given in (S0.f, S1.f,
+ // --- S2.f).
+ void
+ Inst_VOP3__V_CUBEMA_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ panicUnimplemented();
+ } // execute
+ // --- Inst_VOP3__V_BFE_U32 class methods ---
+
+ Inst_VOP3__V_BFE_U32::Inst_VOP3__V_BFE_U32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_bfe_u32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_BFE_U32
+
+ Inst_VOP3__V_BFE_U32::~Inst_VOP3__V_BFE_U32()
+ {
+ } // ~Inst_VOP3__V_BFE_U32
+
+ // --- description from .arch file ---
+ // D.u = (S0.u>>S1.u[4:0]) & ((1<<S2.u[4:0])-1); bitfield extract.
+ // S0 = data, S1 = field offset, S2 = field width.
+ void
+ Inst_VOP3__V_BFE_U32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // Build the mask with a 64-bit literal: a field width of
+ // 31 would otherwise shift a signed int into its sign bit.
+ vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
+ & ((1ULL << bits(src2[lane], 4, 0)) - 1);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_BFE_I32 class methods ---
+
+ Inst_VOP3__V_BFE_I32::Inst_VOP3__V_BFE_I32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_bfe_i32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_BFE_I32
+
+ Inst_VOP3__V_BFE_I32::~Inst_VOP3__V_BFE_I32()
+ {
+ } // ~Inst_VOP3__V_BFE_I32
+
+ // --- description from .arch file ---
+ // D.i = (S0.i>>S1.u[4:0]) & ((1<<S2.u[4:0])-1); bitfield extract,
+ // then sign-extend the extracted field.
+ // S0 = data, S1 = field offset, S2 = field width.
+ void
+ Inst_VOP3__V_BFE_I32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ int width = bits(src2[lane], 4, 0);
+
+ // A zero-width field extracts nothing. This also guards
+ // the sign-extension test below, which would otherwise
+ // shift by -1 (undefined behavior) when width == 0.
+ if (width == 0) {
+ vdst[lane] = 0;
+ continue;
+ }
+
+ vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
+ & ((1ULL << width) - 1);
+
+ // Above extracted a signed int of size width bits which
+ // needs to be sign-extended. Check if the MSB of our
+ // width-bit integer is 1, and sign extend if it is.
+ if (vdst[lane] >> (width - 1)) {
+ vdst[lane] |= 0xffffffff << width;
+ }
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_BFI_B32 class methods ---
+
+ Inst_VOP3__V_BFI_B32::Inst_VOP3__V_BFI_B32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_bfi_b32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_BFI_B32
+
+ Inst_VOP3__V_BFI_B32::~Inst_VOP3__V_BFI_B32()
+ {
+ } // ~Inst_VOP3__V_BFI_B32
+
+ // --- description from .arch file ---
+ // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert.
+ void
+ Inst_VOP3__V_BFI_B32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // src0 selects bit-for-bit between src1 (where the src0
+ // bit is 1) and src2 (where it is 0).
+ vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane]
+ & src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_FMA_F32 class methods ---
+
+ Inst_VOP3__V_FMA_F32::Inst_VOP3__V_FMA_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_fma_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ setFlag(FMA);
+ } // Inst_VOP3__V_FMA_F32
+
+ Inst_VOP3__V_FMA_F32::~Inst_VOP3__V_FMA_F32()
+ {
+ } // ~Inst_VOP3__V_FMA_F32
+
+ // --- description from .arch file ---
+ // D.f = S0.f * S1.f + S2.f.
+ void
+ Inst_VOP3__V_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+ VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ // FP op: honor the per-source ABS/NEG input modifier bits.
+ if (instData.ABS & 0x1) {
+ src0.absModifier();
+ }
+
+ if (instData.ABS & 0x2) {
+ src1.absModifier();
+ }
+
+ if (instData.ABS & 0x4) {
+ src2.absModifier();
+ }
+
+ if (extData.NEG & 0x1) {
+ src0.negModifier();
+ }
+
+ if (extData.NEG & 0x2) {
+ src1.negModifier();
+ }
+
+ if (extData.NEG & 0x4) {
+ src2.negModifier();
+ }
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_FMA_F64 class methods ---
+
+ Inst_VOP3__V_FMA_F64::Inst_VOP3__V_FMA_F64(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_fma_f64", false)
+ {
+ setFlag(ALU);
+ setFlag(F64);
+ setFlag(FMA);
+ } // Inst_VOP3__V_FMA_F64
+
+ Inst_VOP3__V_FMA_F64::~Inst_VOP3__V_FMA_F64()
+ {
+ } // ~Inst_VOP3__V_FMA_F64
+
+ // --- description from .arch file ---
+ // D.d = S0.d * S1.d + S2.d.
+ void
+ Inst_VOP3__V_FMA_F64::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
+ VecOperandF64 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ if (instData.ABS & 0x1) {
+ src0.absModifier();
+ }
+
+ if (instData.ABS & 0x2) {
+ src1.absModifier();
+ }
+
+ if (instData.ABS & 0x4) {
+ src2.absModifier();
+ }
+
+ if (extData.NEG & 0x1) {
+ src0.negModifier();
+ }
+
+ if (extData.NEG & 0x2) {
+ src1.negModifier();
+ }
+
+ if (extData.NEG & 0x4) {
+ src2.negModifier();
+ }
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_LERP_U8 class methods ---
+
+ Inst_VOP3__V_LERP_U8::Inst_VOP3__V_LERP_U8(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_lerp_u8", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_LERP_U8
+
+ Inst_VOP3__V_LERP_U8::~Inst_VOP3__V_LERP_U8()
+ {
+ } // ~Inst_VOP3__V_LERP_U8
+
+ // --- description from .arch file ---
+ // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24
+ // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16;
+ // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8;
+ // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1).
+ // Unsigned 8-bit pixel average on packed unsigned bytes (linear
+ // --- interpolation). S2 acts as a round mode; if set, 0.5 rounds up,
+ // --- otherwise 0.5 truncates.
+ void
+ Inst_VOP3__V_LERP_U8::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // Per-byte average of src0 and src1; bit 8*k of src2
+ // supplies the rounding carry for byte k.
+ vdst[lane] = ((bits(src0[lane], 31, 24)
+ + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1)
+ << 24;
+ vdst[lane] += ((bits(src0[lane], 23, 16)
+ + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1)
+ << 16;
+ vdst[lane] += ((bits(src0[lane], 15, 8)
+ + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1)
+ << 8;
+ vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0)
+ + bits(src2[lane], 0)) >> 1);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_ALIGNBIT_B32 class methods ---
+
+ Inst_VOP3__V_ALIGNBIT_B32::Inst_VOP3__V_ALIGNBIT_B32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_alignbit_b32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_ALIGNBIT_B32
+
+ Inst_VOP3__V_ALIGNBIT_B32::~Inst_VOP3__V_ALIGNBIT_B32()
+ {
+ } // ~Inst_VOP3__V_ALIGNBIT_B32
+
+ // --- description from .arch file ---
+ // D.u = ({S0,S1} >> S2.u[4:0]) & 0xffffffff.
+ void
+ Inst_VOP3__V_ALIGNBIT_B32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // src0 forms the upper and src1 the lower 32 bits of the
+ // 64-bit quantity being shifted; shift is at most 31.
+ VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
+ | (VecElemU64)src1[lane]);
+ vdst[lane] = (VecElemU32)((src_0_1
+ >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_ALIGNBYTE_B32 class methods ---
+
+ Inst_VOP3__V_ALIGNBYTE_B32::Inst_VOP3__V_ALIGNBYTE_B32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_alignbyte_b32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_ALIGNBYTE_B32
+
+ Inst_VOP3__V_ALIGNBYTE_B32::~Inst_VOP3__V_ALIGNBYTE_B32()
+ {
+ } // ~Inst_VOP3__V_ALIGNBYTE_B32
+
+ // --- description from .arch file ---
+ // D.u = ({S0,S1} >> (8*S2.u[4:0])) & 0xffffffff.
+ void
+ Inst_VOP3__V_ALIGNBYTE_B32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // src0 forms the upper and src1 the lower 32 bits of the
+ // 64-bit quantity being shifted.
+ VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
+ | (VecElemU64)src1[lane]);
+ VecElemU64 shift = 8ULL * (VecElemU64)bits(src2[lane], 4, 0);
+ // S2[4:0] can be up to 31, i.e. a byte shift of up to 248
+ // bits. Shifting a 64-bit value by >= 64 is undefined
+ // behavior in C++, so clamp: such a shift drains every
+ // bit and the result is 0.
+ vdst[lane] = (shift < 64) ?
+ (VecElemU32)((src_0_1 >> shift) & 0xffffffff) : 0;
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MIN3_F32 class methods ---
+
+ Inst_VOP3__V_MIN3_F32::Inst_VOP3__V_MIN3_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_min3_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ } // Inst_VOP3__V_MIN3_F32
+
+ Inst_VOP3__V_MIN3_F32::~Inst_VOP3__V_MIN3_F32()
+ {
+ } // ~Inst_VOP3__V_MIN3_F32
+
+ // --- description from .arch file ---
+ // D.f = min(S0.f, S1.f, S2.f).
+ void
+ Inst_VOP3__V_MIN3_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+ VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ // FP op: honor the per-source ABS/NEG input modifier bits.
+ if (instData.ABS & 0x1) {
+ src0.absModifier();
+ }
+
+ if (instData.ABS & 0x2) {
+ src1.absModifier();
+ }
+
+ if (instData.ABS & 0x4) {
+ src2.absModifier();
+ }
+
+ if (extData.NEG & 0x1) {
+ src0.negModifier();
+ }
+
+ if (extData.NEG & 0x2) {
+ src1.negModifier();
+ }
+
+ if (extData.NEG & 0x4) {
+ src2.negModifier();
+ }
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // Three-way minimum via two pairwise std::fmin calls.
+ VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]);
+ vdst[lane] = std::fmin(min_0_1, src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MIN3_I32 class methods ---
+
+ Inst_VOP3__V_MIN3_I32::Inst_VOP3__V_MIN3_I32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_min3_i32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_MIN3_I32
+
+ Inst_VOP3__V_MIN3_I32::~Inst_VOP3__V_MIN3_I32()
+ {
+ } // ~Inst_VOP3__V_MIN3_I32
+
+ // --- description from .arch file ---
+ // D.i = min(S0.i, S1.i, S2.i).
+ void
+ Inst_VOP3__V_MIN3_I32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
+ VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]);
+ vdst[lane] = std::min(min_0_1, src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MIN3_U32 class methods ---
+
+ Inst_VOP3__V_MIN3_U32::Inst_VOP3__V_MIN3_U32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_min3_u32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_MIN3_U32
+
+ Inst_VOP3__V_MIN3_U32::~Inst_VOP3__V_MIN3_U32()
+ {
+ } // ~Inst_VOP3__V_MIN3_U32
+
+ // --- description from .arch file ---
+ // D.u = min(S0.u, S1.u, S2.u).
+ void
+ Inst_VOP3__V_MIN3_U32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]);
+ vdst[lane] = std::min(min_0_1, src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MAX3_F32 class methods ---
+
+ Inst_VOP3__V_MAX3_F32::Inst_VOP3__V_MAX3_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_max3_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ } // Inst_VOP3__V_MAX3_F32
+
+ Inst_VOP3__V_MAX3_F32::~Inst_VOP3__V_MAX3_F32()
+ {
+ } // ~Inst_VOP3__V_MAX3_F32
+
+ // --- description from .arch file ---
+ // D.f = max(S0.f, S1.f, S2.f).
+ void
+ Inst_VOP3__V_MAX3_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+ VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ // FP op: honor the per-source ABS/NEG input modifier bits.
+ if (instData.ABS & 0x1) {
+ src0.absModifier();
+ }
+
+ if (instData.ABS & 0x2) {
+ src1.absModifier();
+ }
+
+ if (instData.ABS & 0x4) {
+ src2.absModifier();
+ }
+
+ if (extData.NEG & 0x1) {
+ src0.negModifier();
+ }
+
+ if (extData.NEG & 0x2) {
+ src1.negModifier();
+ }
+
+ if (extData.NEG & 0x4) {
+ src2.negModifier();
+ }
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // Three-way maximum via two pairwise std::fmax calls.
+ VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]);
+ vdst[lane] = std::fmax(max_0_1, src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MAX3_I32 class methods ---
+
+ Inst_VOP3__V_MAX3_I32::Inst_VOP3__V_MAX3_I32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_max3_i32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_MAX3_I32
+
+ Inst_VOP3__V_MAX3_I32::~Inst_VOP3__V_MAX3_I32()
+ {
+ } // ~Inst_VOP3__V_MAX3_I32
+
+ // --- description from .arch file ---
+ // D.i = max(S0.i, S1.i, S2.i).
+ void
+ Inst_VOP3__V_MAX3_I32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
+ VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]);
+ vdst[lane] = std::max(max_0_1, src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MAX3_U32 class methods ---
+
+ Inst_VOP3__V_MAX3_U32::Inst_VOP3__V_MAX3_U32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_max3_u32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_MAX3_U32
+
+ Inst_VOP3__V_MAX3_U32::~Inst_VOP3__V_MAX3_U32()
+ {
+ } // ~Inst_VOP3__V_MAX3_U32
+
+ // --- description from .arch file ---
+ // D.u = max(S0.u, S1.u, S2.u).
+ void
+ Inst_VOP3__V_MAX3_U32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]);
+ vdst[lane] = std::max(max_0_1, src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MED3_F32 class methods ---
+
+ Inst_VOP3__V_MED3_F32::Inst_VOP3__V_MED3_F32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_med3_f32", false)
+ {
+ setFlag(ALU);
+ setFlag(F32);
+ } // Inst_VOP3__V_MED3_F32
+
+ Inst_VOP3__V_MED3_F32::~Inst_VOP3__V_MED3_F32()
+ {
+ } // ~Inst_VOP3__V_MED3_F32
+
+ // --- description from .arch file ---
+ // D.f = median(S0.f, S1.f, S2.f).
+ void
+ Inst_VOP3__V_MED3_F32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
+ VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ // FP op: honor the per-source ABS/NEG input modifier bits.
+ if (instData.ABS & 0x1) {
+ src0.absModifier();
+ }
+
+ if (instData.ABS & 0x2) {
+ src1.absModifier();
+ }
+
+ if (instData.ABS & 0x4) {
+ src2.absModifier();
+ }
+
+ if (extData.NEG & 0x1) {
+ src0.negModifier();
+ }
+
+ if (extData.NEG & 0x2) {
+ src1.negModifier();
+ }
+
+ if (extData.NEG & 0x4) {
+ src2.negModifier();
+ }
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ // The median() helper picks the middle of the three.
+ vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MED3_I32 class methods ---
+
+ Inst_VOP3__V_MED3_I32::Inst_VOP3__V_MED3_I32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_med3_i32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_MED3_I32
+
+ Inst_VOP3__V_MED3_I32::~Inst_VOP3__V_MED3_I32()
+ {
+ } // ~Inst_VOP3__V_MED3_I32
+
+ // --- description from .arch file ---
+ // D.i = median(S0.i, S1.i, S2.i).
+ void
+ Inst_VOP3__V_MED3_I32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
+ VecOperandI32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_MED3_U32 class methods ---
+
+ Inst_VOP3__V_MED3_U32::Inst_VOP3__V_MED3_U32(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_med3_u32", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_MED3_U32
+
+ Inst_VOP3__V_MED3_U32::~Inst_VOP3__V_MED3_U32()
+ {
+ } // ~Inst_VOP3__V_MED3_U32
+
+ // --- description from .arch file ---
+ // D.u = median(S0.u, S1.u, S2.u).
+ void
+ Inst_VOP3__V_MED3_U32::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wf = gpuDynInst->wavefront();
+ ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+ ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+ ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
+ VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+ src0.readSrc();
+ src1.readSrc();
+ src2.readSrc();
+
+ /**
+ * input modifiers are supported by FP operations only
+ */
+ assert(!(instData.ABS & 0x1));
+ assert(!(instData.ABS & 0x2));
+ assert(!(instData.ABS & 0x4));
+ assert(!(extData.NEG & 0x1));
+ assert(!(extData.NEG & 0x2));
+ assert(!(extData.NEG & 0x4));
+
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
+ }
+ }
+
+ vdst.write();
+ } // execute
+ // --- Inst_VOP3__V_SAD_U8 class methods ---
+
+ Inst_VOP3__V_SAD_U8::Inst_VOP3__V_SAD_U8(InFmt_VOP3A *iFmt)
+ : Inst_VOP3A(iFmt, "v_sad_u8", false)
+ {
+ setFlag(ALU);
+ } // Inst_VOP3__V_SAD_U8
+
+ Inst_VOP3__V_SAD_U8::~Inst_VOP3__V_SAD_U8()
+ {
+ } // ~Inst_VOP3__V_SAD_U8
+
+ // --- description from .arch file ---
+ // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) +
+ // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u.
+ // Sum of absolute differences with accumulation, overflow into upper bits
+ // is allowed.
// Byte-wise sum of absolute differences with accumulation:
// D.u = sum over the four bytes b of abs(S0.byte[b] - S1.byte[b]) + S2.u.
void
Inst_VOP3__V_SAD_U8::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    // Signed operands so the per-byte differences below can go negative
    // before std::abs() is applied.
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // bits(x, hi, lo) extracts the byte; the subtraction happens in
            // int, so std::abs gives the true byte-wise distance. Overflow
            // of the sum into the upper result bits is allowed per spec.
            vdst[lane] = std::abs(bits(src0[lane], 31, 24)
                - bits(src1[lane], 31, 24))
                + std::abs(bits(src0[lane], 23, 16)
                - bits(src1[lane], 23, 16))
                + std::abs(bits(src0[lane], 15, 8)
                - bits(src1[lane], 15, 8))
                + std::abs(bits(src0[lane], 7, 0)
                - bits(src1[lane], 7, 0)) + src2[lane];
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_SAD_HI_U8 class methods ---

Inst_VOP3__V_SAD_HI_U8::Inst_VOP3__V_SAD_HI_U8(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_sad_hi_u8", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_SAD_HI_U8

Inst_VOP3__V_SAD_HI_U8::~Inst_VOP3__V_SAD_HI_U8()
{
} // ~Inst_VOP3__V_SAD_HI_U8

// --- description from .arch file ---
// D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u.
// Sum of absolute differences with accumulation, overflow is lost.
+ void + Inst_VOP3__V_SAD_HI_U8::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (((bits(src0[lane], 31, 24) + - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16) + - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8) + - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0) + - bits(src1[lane], 7, 0))) << 16) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_SAD_U16 class methods --- + + Inst_VOP3__V_SAD_U16::Inst_VOP3__V_SAD_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_sad_u16", false) + { + setFlag(ALU); + } // Inst_VOP3__V_SAD_U16 + + Inst_VOP3__V_SAD_U16::~Inst_VOP3__V_SAD_U16() + { + } // ~Inst_VOP3__V_SAD_U16 + + // --- description from .arch file --- + // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0]) + // + S2.u. + // Word SAD with accumulation. 
// Halfword sum of absolute differences with accumulation:
// D.u = abs(S0[31:16] - S1[31:16]) + abs(S0[15:0] - S1[15:0]) + S2.u.
void
Inst_VOP3__V_SAD_U16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    // Signed operands so the halfword differences can go negative before
    // std::abs() is applied (same approach as V_SAD_U8).
    ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x1));
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::abs(bits(src0[lane], 31, 16)
                - bits(src1[lane], 31, 16))
                + std::abs(bits(src0[lane], 15, 0)
                - bits(src1[lane], 15, 0)) + src2[lane];
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_SAD_U32 class methods ---

Inst_VOP3__V_SAD_U32::Inst_VOP3__V_SAD_U32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_sad_u32", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_SAD_U32

Inst_VOP3__V_SAD_U32::~Inst_VOP3__V_SAD_U32()
{
} // ~Inst_VOP3__V_SAD_U32

// --- description from .arch file ---
// D.u = abs(S0.i - S1.i) + S2.u.
// Dword SAD with accumulation.
+ void + Inst_VOP3__V_SAD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane]; + } // if + } // for + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_PK_U8_F32 class methods --- + + Inst_VOP3__V_CVT_PK_U8_F32::Inst_VOP3__V_CVT_PK_U8_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pk_u8_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PK_U8_F32 + + Inst_VOP3__V_CVT_PK_U8_F32::~Inst_VOP3__V_CVT_PK_U8_F32() + { + } // ~Inst_VOP3__V_CVT_PK_U8_F32 + + // --- description from .arch file --- + // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0])) + // | (S2.u & ~(0xff << (8 * S1.u[1:0]))). + // Convert floating point value S0 to 8-bit unsigned integer and pack the + // result into byte S1 of dword S2. 
// Convert S0.f to an 8-bit unsigned integer and insert it into byte
// S1.u[1:0] of S2.u, leaving the other three bytes of S2 untouched.
void
Inst_VOP3__V_CVT_PK_U8_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    // Only src0 is floating point, so only its ABS/NEG modifier bits are
    // honored; the bits for src1/src2 are asserted clear below.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }


    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    /**
     * input modifiers are supported by FP operations only
     */
    assert(!(instData.ABS & 0x2));
    assert(!(instData.ABS & 0x4));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // NOTE(review): the plain (VecElemU8) cast truncates toward
            // zero and is undefined for NaN or out-of-range inputs; the
            // spec's flt32_to_uint8 presumably clamps -- confirm against
            // hardware behavior.
            vdst[lane] = (((VecElemU8)src0[lane] & 0xff)
                << (8 * bits(src1[lane], 1, 0)))
                | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0))));
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_DIV_FIXUP_F32 class methods ---

Inst_VOP3__V_DIV_FIXUP_F32::Inst_VOP3__V_DIV_FIXUP_F32(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_div_fixup_f32", false)
{
    setFlag(ALU);
    setFlag(F32);
} // Inst_VOP3__V_DIV_FIXUP_F32

Inst_VOP3__V_DIV_FIXUP_F32::~Inst_VOP3__V_DIV_FIXUP_F32()
{
} // ~Inst_VOP3__V_DIV_FIXUP_F32

// --- description from .arch file ---
// D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator,
// s2.f = Numerator. This opcode generates exceptions resulting from the
// division operation.
// Divide fixup (F32): patch up the special cases of src2 / src1.
// s0.f = quotient estimate, s1.f = denominator, s2.f = numerator.
void
Inst_VOP3__V_DIV_FIXUP_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    // FP input modifiers: abs is applied before neg for each source.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (instData.ABS & 0x4) {
        src2.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    if (extData.NEG & 0x4) {
        src2.negModifier();
    }

    // NOTE(review): this implementation ignores the quotient estimate in
    // src0 and recomputes src2 / src1 directly, handling the denominator
    // special cases (signed zero, NaN, infinity) explicitly -- confirm
    // this is the intended approximation of the hardware fixup.
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            if (std::fpclassify(src1[lane]) == FP_ZERO) {
                // x / +-0 -> +-inf, sign taken from the zero denominator.
                if (std::signbit(src1[lane])) {
                    vdst[lane] = -INFINITY;
                } else {
                    vdst[lane] = +INFINITY;
                }
            } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) {
                vdst[lane] = NAN;
            } else if (std::isinf(src1[lane])) {
                // NOTE(review): x / +-inf producing +-inf here looks
                // suspect (IEEE division would give +-0) -- verify
                // against the hardware div_fixup definition.
                if (std::signbit(src1[lane])) {
                    vdst[lane] = -INFINITY;
                } else {
                    vdst[lane] = +INFINITY;
                }
            } else {
                vdst[lane] = src2[lane] / src1[lane];
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---

Inst_VOP3__V_DIV_FIXUP_F64::Inst_VOP3__V_DIV_FIXUP_F64(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_div_fixup_f64", false)
{
    setFlag(ALU);
    setFlag(F64);
} // Inst_VOP3__V_DIV_FIXUP_F64

Inst_VOP3__V_DIV_FIXUP_F64::~Inst_VOP3__V_DIV_FIXUP_F64()
{
} // ~Inst_VOP3__V_DIV_FIXUP_F64

// --- description from .arch file ---
// D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
// s2.d = Numerator. This opcode generates exceptions resulting from the
// division operation.
+ void + Inst_VOP3__V_DIV_FIXUP_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int sign_out = std::signbit(src1[lane]) + ^ std::signbit(src2[lane]); + int exp1(0); + int exp2(0); + std::frexp(src1[lane], &exp1); + std::frexp(src2[lane], &exp2); + + if (std::isnan(src1[lane]) || std::isnan(src2[lane])) { + vdst[lane] = std::numeric_limits::quiet_NaN(); + } else if (std::fpclassify(src1[lane]) == FP_ZERO + && std::fpclassify(src2[lane]) == FP_ZERO) { + vdst[lane] + = std::numeric_limits::signaling_NaN(); + } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) { + vdst[lane] + = std::numeric_limits::signaling_NaN(); + } else if (std::fpclassify(src1[lane]) == FP_ZERO + || std::isinf(src2[lane])) { + vdst[lane] = sign_out ? -INFINITY : +INFINITY; + } else if (std::isinf(src1[lane]) + || std::fpclassify(src2[lane]) == FP_ZERO) { + vdst[lane] = sign_out ? -0.0 : +0.0; + } else if (exp2 - exp1 < -1075) { + vdst[lane] = src0[lane]; + } else if (exp1 == 2047) { + vdst[lane] = src0[lane]; + } else { + vdst[lane] = sign_out ? 
-std::fabs(src0[lane]) + : std::fabs(src0[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_SCALE_F32 class methods --- + + Inst_VOP3__V_DIV_SCALE_F32::Inst_VOP3__V_DIV_SCALE_F32( + InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_div_scale_f32") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(F32); + } // Inst_VOP3__V_DIV_SCALE_F32 + + Inst_VOP3__V_DIV_SCALE_F32::~Inst_VOP3__V_DIV_SCALE_F32() + { + } // ~Inst_VOP3__V_DIV_SCALE_F32 + + // --- description from .arch file --- + // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f = + // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a + // numerator and denominator, this opcode will appropriately scale inputs + // for division to avoid subnormal terms during Newton-Raphson correction + // algorithm. This opcode producses a VCC flag for post-scale of quotient. + void + Inst_VOP3__V_DIV_SCALE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane]; + vcc.setBit(lane, 0); + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_SCALE_F64 class methods --- + + Inst_VOP3__V_DIV_SCALE_F64::Inst_VOP3__V_DIV_SCALE_F64( + InFmt_VOP3B *iFmt) + : Inst_VOP3B(iFmt, "v_div_scale_f64") + { + setFlag(ALU); + setFlag(WritesVCC); + setFlag(F64); + } // Inst_VOP3__V_DIV_SCALE_F64 + + Inst_VOP3__V_DIV_SCALE_F64::~Inst_VOP3__V_DIV_SCALE_F64() + { + 
} // ~Inst_VOP3__V_DIV_SCALE_F64 + + // --- description from .arch file --- + // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d = + // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a + // numerator and denominator, this opcode will appropriately scale inputs + // for division to avoid subnormal terms during Newton-Raphson correction + // algorithm. This opcode producses a VCC flag for post-scale of quotient. + void + Inst_VOP3__V_DIV_SCALE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF64 src2(gpuDynInst, extData.SRC2); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int exp1(0); + int exp2(0); + std::frexp(src1[lane], &exp1); + std::frexp(src2[lane], &exp2); + vcc.setBit(lane, 0); + + if (std::fpclassify(src1[lane]) == FP_ZERO + || std::fpclassify(src2[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (exp2 - exp1 >= 768) { + vcc.setBit(lane, 1); + if (src0[lane] == src1[lane]) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) { + vdst[lane] = std::ldexp(src0[lane], 128); + } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL + && std::fpclassify(src2[lane] / src1[lane]) + == FP_SUBNORMAL) { + vcc.setBit(lane, 1); + if (src0[lane] == src1[lane]) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) { + vdst[lane] = std::ldexp(src0[lane], -128); + } else if (std::fpclassify(src2[lane] / 
src1[lane]) + == FP_SUBNORMAL) { + vcc.setBit(lane, 1); + if (src0[lane] == src2[lane]) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } else if (exp2 <= 53) { + vdst[lane] = std::ldexp(src0[lane], 128); + } + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_FMAS_F32 class methods --- + + Inst_VOP3__V_DIV_FMAS_F32::Inst_VOP3__V_DIV_FMAS_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_div_fmas_f32", false) + { + setFlag(ALU); + setFlag(ReadsVCC); + setFlag(F32); + setFlag(FMA); + } // Inst_VOP3__V_DIV_FMAS_F32 + + Inst_VOP3__V_DIV_FMAS_F32::~Inst_VOP3__V_DIV_FMAS_F32() + { + } // ~Inst_VOP3__V_DIV_FMAS_F32 + + // --- description from .arch file --- + // D.f = Special case divide FMA with scale and flags(s0.f = Quotient, + // s1.f = Denominator, s2.f = Numerator) + void + Inst_VOP3__V_DIV_FMAS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (instData.ABS & 0x4) { + src2.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + if (extData.NEG & 0x4) { + src2.negModifier(); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]); + } + } + + //vdst.write(); + } // execute + // --- Inst_VOP3__V_DIV_FMAS_F64 class methods --- + + Inst_VOP3__V_DIV_FMAS_F64::Inst_VOP3__V_DIV_FMAS_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_div_fmas_f64", false) + { + setFlag(ALU); + setFlag(ReadsVCC); + setFlag(F64); + setFlag(FMA); + } // Inst_VOP3__V_DIV_FMAS_F64 + + 
Inst_VOP3__V_DIV_FMAS_F64::~Inst_VOP3__V_DIV_FMAS_F64()
{
} // ~Inst_VOP3__V_DIV_FMAS_F64

// --- description from .arch file ---
// D.d = Special case divide FMA with scale and flags(s0.d = Quotient,
// s1.d = Denominator, s2.d = Numerator)
void
Inst_VOP3__V_DIV_FMAS_F64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
    VecOperandF64 vdst(gpuDynInst, instData.VDST);
    // VCC carries the per-lane "post-scale needed" flags produced by the
    // matching V_DIV_SCALE_F64.
    ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();
    vcc.read();

    // FP input modifiers: abs is applied before neg for each source.
    if (instData.ABS & 0x1) {
        src0.absModifier();
    }

    if (instData.ABS & 0x2) {
        src1.absModifier();
    }

    if (instData.ABS & 0x4) {
        src2.absModifier();
    }

    if (extData.NEG & 0x1) {
        src0.negModifier();
    }

    if (extData.NEG & 0x2) {
        src1.negModifier();
    }

    if (extData.NEG & 0x4) {
        src2.negModifier();
    }

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            if (bits(vcc.rawData(), lane)) {
                // Lane flagged by div_scale: undo the pre-scale by
                // multiplying the fused result by 2^64.
                vdst[lane] = std::pow(2, 64)
                    * std::fma(src0[lane], src1[lane], src2[lane]);
            } else {
                vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
            }
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_MSAD_U8 class methods ---

Inst_VOP3__V_MSAD_U8::Inst_VOP3__V_MSAD_U8(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_msad_u8", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MSAD_U8

Inst_VOP3__V_MSAD_U8::~Inst_VOP3__V_MSAD_U8()
{
} // ~Inst_VOP3__V_MSAD_U8

// --- description from .arch file ---
// D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u).
// Unimplemented: the simulator panics if a kernel executes this opcode.
void
Inst_VOP3__V_MSAD_U8::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_QSAD_PK_U16_U8 class methods ---

Inst_VOP3__V_QSAD_PK_U16_U8::Inst_VOP3__V_QSAD_PK_U16_U8(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_qsad_pk_u16_u8", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_QSAD_PK_U16_U8

Inst_VOP3__V_QSAD_PK_U16_U8::~Inst_VOP3__V_QSAD_PK_U16_U8()
{
} // ~Inst_VOP3__V_QSAD_PK_U16_U8

// --- description from .arch file ---
// D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
// S1.u[31:0], S2.u[63:0])
// Unimplemented: the simulator panics if a kernel executes this opcode.
void
Inst_VOP3__V_QSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_MQSAD_PK_U16_U8 class methods ---

Inst_VOP3__V_MQSAD_PK_U16_U8::Inst_VOP3__V_MQSAD_PK_U16_U8(
    InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_mqsad_pk_u16_u8", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MQSAD_PK_U16_U8

Inst_VOP3__V_MQSAD_PK_U16_U8::~Inst_VOP3__V_MQSAD_PK_U16_U8()
{
} // ~Inst_VOP3__V_MQSAD_PK_U16_U8

// --- description from .arch file ---
// D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
// --- S1.u[31:0], S2.u[63:0])
// Unimplemented: the simulator panics if a kernel executes this opcode.
void
Inst_VOP3__V_MQSAD_PK_U16_U8::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_MQSAD_U32_U8 class methods ---

Inst_VOP3__V_MQSAD_U32_U8::Inst_VOP3__V_MQSAD_U32_U8(InFmt_VOP3A *iFmt)
    : Inst_VOP3A(iFmt, "v_mqsad_u32_u8", false)
{
    setFlag(ALU);
} // Inst_VOP3__V_MQSAD_U32_U8

Inst_VOP3__V_MQSAD_U32_U8::~Inst_VOP3__V_MQSAD_U32_U8()
{
} // ~Inst_VOP3__V_MQSAD_U32_U8

// --- description from .arch file ---
// D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0],
// --- S1.u[31:0], S2.u[127:0])
// Unimplemented: the simulator panics if a kernel executes this opcode.
void
Inst_VOP3__V_MQSAD_U32_U8::execute(GPUDynInstPtr gpuDynInst)
{
    panicUnimplemented();
} // execute
// --- Inst_VOP3__V_MAD_U64_U32 class methods ---

Inst_VOP3__V_MAD_U64_U32::Inst_VOP3__V_MAD_U64_U32(
    InFmt_VOP3B *iFmt)
    : Inst_VOP3B(iFmt, "v_mad_u64_u32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
    setFlag(MAD);
} // Inst_VOP3__V_MAD_U64_U32

Inst_VOP3__V_MAD_U64_U32::~Inst_VOP3__V_MAD_U64_U32()
{
} // ~Inst_VOP3__V_MAD_U64_U32

// --- description from .arch file ---
// {vcc_out,D.u64} = S0.u32 * S1.u32 + S2.u64.
// 32x32 -> 64-bit multiply-add; the carry out of the 64-bit add goes to
// the per-lane VCC bit.
void
Inst_VOP3__V_MAD_U64_U32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
    ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();
    // Pre-read the destination so lanes masked off by EXEC keep their
    // previous register contents when vdst is written back below.
    vdst.read();

    /**
     * input modifiers are supported by FP operations only
     * (VOP3B has no ABS field, so only NEG is checked here)
     */
    assert(!(extData.NEG & 0x1));
    assert(!(extData.NEG & 0x2));
    assert(!(extData.NEG & 0x4));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // muladd() stores src0*src1+src2 into vdst[lane] and returns
            // the carry, which becomes the lane's VCC bit.
            vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
                src2[lane]));
        }
    }

    vcc.write();
    vdst.write();
} // execute
// --- Inst_VOP3__V_MAD_I64_I32 class methods ---

Inst_VOP3__V_MAD_I64_I32::Inst_VOP3__V_MAD_I64_I32(
    InFmt_VOP3B *iFmt)
    : Inst_VOP3B(iFmt, "v_mad_i64_i32")
{
    setFlag(ALU);
    setFlag(WritesVCC);
    setFlag(MAD);
} // Inst_VOP3__V_MAD_I64_I32

Inst_VOP3__V_MAD_I64_I32::~Inst_VOP3__V_MAD_I64_I32()
{
} // ~Inst_VOP3__V_MAD_I64_I32

// --- description from .arch file ---
// {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64.
+ void + Inst_VOP3__V_MAD_I64_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI64 src2(gpuDynInst, extData.SRC2); + ScalarOperandU64 vcc(gpuDynInst, instData.SDST); + VecOperandI64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane], + src2[lane])); + } + } + + vcc.write(); + vdst.write(); + } // execute + // --- Inst_VOP3__V_XAD_U32 class methods --- + + Inst_VOP3__V_XAD_U32::Inst_VOP3__V_XAD_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_xad_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_XAD_U32 + + Inst_VOP3__V_XAD_U32::~Inst_VOP3__V_XAD_U32() + { + } // ~Inst_VOP3__V_XAD_U32 + + // --- description from .arch file --- + // D.u32 = (S0.u32 ^ S1.u32) + S2.u32. 
+ void + Inst_VOP3__V_XAD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] ^ src1[lane]) + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHL_ADD_U32 class methods --- + + Inst_VOP3__V_LSHL_ADD_U32::Inst_VOP3__V_LSHL_ADD_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_add_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_ADD_U32 + + Inst_VOP3__V_LSHL_ADD_U32::~Inst_VOP3__V_LSHL_ADD_U32() + { + } // ~Inst_VOP3__V_LSHL_ADD_U32 + + // --- description from .arch file --- + // D.u = (S0.u << S1.u[4:0]) + S2.u. 
+ void + Inst_VOP3__V_LSHL_ADD_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) + + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD_LSHL_U32 class methods --- + + Inst_VOP3__V_ADD_LSHL_U32::Inst_VOP3__V_ADD_LSHL_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_lshl_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD_LSHL_U32 + + Inst_VOP3__V_ADD_LSHL_U32::~Inst_VOP3__V_ADD_LSHL_U32() + { + } // ~Inst_VOP3__V_ADD_LSHL_U32 + + // --- description from .arch file --- + // D.u = (S0.u + S1.u) << S2.u[4:0]. 
+ void + Inst_VOP3__V_ADD_LSHL_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = + (src0[lane] + src1[lane]) << bits(src2[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ADD3_U32 class methods --- + + Inst_VOP3__V_ADD3_U32::Inst_VOP3__V_ADD3_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add3_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ADD3_U32 + + Inst_VOP3__V_ADD3_U32::~Inst_VOP3__V_ADD3_U32() + { + } // ~Inst_VOP3__V_ADD3_U32 + + // --- description from .arch file --- + // D.u = S0.u + S1.u + S2.u. 
+ void + Inst_VOP3__V_ADD3_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane] + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHL_OR_B32 class methods --- + + Inst_VOP3__V_LSHL_OR_B32::Inst_VOP3__V_LSHL_OR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_or_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_OR_B32 + + Inst_VOP3__V_LSHL_OR_B32::~Inst_VOP3__V_LSHL_OR_B32() + { + } // ~Inst_VOP3__V_LSHL_OR_B32 + + // --- description from .arch file --- + // D.u = (S0.u << S1.u[4:0]) | S2.u. 
+ void + Inst_VOP3__V_LSHL_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0)) + | src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_AND_OR_B32 class methods --- + + Inst_VOP3__V_AND_OR_B32::Inst_VOP3__V_AND_OR_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_and_or_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_AND_OR_B32 + + Inst_VOP3__V_AND_OR_B32::~Inst_VOP3__V_AND_OR_B32() + { + } // ~Inst_VOP3__V_AND_OR_B32 + + // --- description from .arch file --- + // D.u = (S0.u & S1.u) | S2.u. + // Input and output modifiers not supported. 
+ void + Inst_VOP3__V_AND_OR_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = (src0[lane] & src1[lane]) | src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_F16 class methods --- + + Inst_VOP3__V_MAD_F16::Inst_VOP3__V_MAD_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_f16", false) + { + setFlag(ALU); + setFlag(F16); + setFlag(MAD); + } // Inst_VOP3__V_MAD_F16 + + Inst_VOP3__V_MAD_F16::~Inst_VOP3__V_MAD_F16() + { + } // ~Inst_VOP3__V_MAD_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + S2.f16. + // Supports round mode, exception flags, saturation. + void + Inst_VOP3__V_MAD_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_MAD_U16 class methods --- + + Inst_VOP3__V_MAD_U16::Inst_VOP3__V_MAD_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_u16", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_U16 + + Inst_VOP3__V_MAD_U16::~Inst_VOP3__V_MAD_U16() + { + } // ~Inst_VOP3__V_MAD_U16 + + // --- description from .arch file --- + // D.u16 = S0.u16 * S1.u16 + S2.u16. + // Supports saturation (unsigned 16-bit integer domain). 
+ void + Inst_VOP3__V_MAD_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU16 src2(gpuDynInst, extData.SRC2); + VecOperandU16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane] + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAD_I16 class methods --- + + Inst_VOP3__V_MAD_I16::Inst_VOP3__V_MAD_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mad_i16", false) + { + setFlag(ALU); + setFlag(MAD); + } // Inst_VOP3__V_MAD_I16 + + Inst_VOP3__V_MAD_I16::~Inst_VOP3__V_MAD_I16() + { + } // ~Inst_VOP3__V_MAD_I16 + + // --- description from .arch file --- + // D.i16 = S0.i16 * S1.i16 + S2.i16. + // Supports saturation (signed 16-bit integer domain). 
+ void + Inst_VOP3__V_MAD_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ConstVecOperandI16 src2(gpuDynInst, extData.SRC2); + VecOperandI16 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] * src1[lane] + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_PERM_B32 class methods --- + + Inst_VOP3__V_PERM_B32::Inst_VOP3__V_PERM_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_perm_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_PERM_B32 + + Inst_VOP3__V_PERM_B32::~Inst_VOP3__V_PERM_B32() + { + } // ~Inst_VOP3__V_PERM_B32 + + // --- description from .arch file --- + // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]); + // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]); + // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]); + // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]); + // byte permute(byte in[8], byte sel) { + // if (sel>=13) then return 0xff; + // elsif(sel==12) then return 0x00; + // elsif(sel==11) then return in[7][7] * 0xff; + // elsif(sel==10) then return in[5][7] * 0xff; + // elsif(sel==9) then return in[3][7] * 0xff; + // elsif(sel==8) then return in[1][7] * 0xff; + // else return in[sel]; + // } + // Byte permute. 
+ void + Inst_VOP3__V_PERM_B32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU32 src2(gpuDynInst, extData.SRC2); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemU64 selector = (VecElemU64)src0[lane]; + selector = (selector << 32) | (VecElemU64)src1[lane]; + vdst[lane] = 0; + + DPRINTF(VEGA, "Executing v_perm_b32 src_0 0x%08x, src_1 " + "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane], + src1[lane], src2[lane], vdst[lane]); + DPRINTF(VEGA, "Selector: 0x%08x \n", selector); + + for (int i = 0; i < 4 ; ++i) { + VecElemU32 permuted_val = permute(selector, 0xFF + & ((VecElemU32)src2[lane] >> (8 * i))); + vdst[lane] |= (permuted_val << (8 * i)); + } + + DPRINTF(VEGA, "v_perm result: 0x%08x\n", vdst[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_FMA_F16 class methods --- + + Inst_VOP3__V_FMA_F16::Inst_VOP3__V_FMA_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_fma_f16", false) + { + setFlag(ALU); + setFlag(F16); + setFlag(FMA); + } // Inst_VOP3__V_FMA_F16 + + Inst_VOP3__V_FMA_F16::~Inst_VOP3__V_FMA_F16() + { + } // ~Inst_VOP3__V_FMA_F16 + + // --- description from .arch file --- + // D.f16 = S0.f16 * S1.f16 + S2.f16. + // Fused half precision multiply add. 
+ void + Inst_VOP3__V_FMA_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_DIV_FIXUP_F16 class methods --- + + Inst_VOP3__V_DIV_FIXUP_F16::Inst_VOP3__V_DIV_FIXUP_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_div_fixup_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_DIV_FIXUP_F16 + + Inst_VOP3__V_DIV_FIXUP_F16::~Inst_VOP3__V_DIV_FIXUP_F16() + { + } // ~Inst_VOP3__V_DIV_FIXUP_F16 + + // --- description from .arch file --- + // sign_out = sign(S1.f16)^sign(S2.f16); + // if (S2.f16 == NAN) + // D.f16 = Quiet(S2.f16); + // else if (S1.f16 == NAN) + // D.f16 = Quiet(S1.f16); + // else if (S1.f16 == S2.f16 == 0) + // # 0/0 + // D.f16 = pele_nan(0xfe00); + // else if (abs(S1.f16) == abs(S2.f16) == +-INF) + // # inf/inf + // D.f16 = pele_nan(0xfe00); + // else if (S1.f16 ==0 || abs(S2.f16) == +-INF) + // # x/0, or inf/y + // D.f16 = sign_out ? -INF : INF; + // else if (abs(S1.f16) == +-INF || S2.f16 == 0) + // # x/inf, 0/y + // D.f16 = sign_out ? -0 : 0; + // else if ((exp(S2.f16) - exp(S1.f16)) < -150) + // D.f16 = sign_out ? -underflow : underflow; + // else if (exp(S1.f16) == 255) + // D.f16 = sign_out ? -overflow : overflow; + // else + // D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16). + // Half precision division fixup. + // S0 = Quotient, S1 = Denominator, S3 = Numerator. + // Given a numerator, denominator, and quotient from a divide, this opcode + // will detect and apply special case numerics, touching up the quotient if + // necessary. This opcode also generates invalid, denorm and divide by + // zero exceptions caused by the division. 
+ void + Inst_VOP3__V_DIV_FIXUP_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_LSHL_ADD_U64 class methods --- + + Inst_VOP3__V_LSHL_ADD_U64::Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_add_u64", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_ADD_U64 + + Inst_VOP3__V_LSHL_ADD_U64::~Inst_VOP3__V_LSHL_ADD_U64() + { + } // ~Inst_VOP3__V_LSHL_ADD_U64 + + // --- description from .arch file --- + // D.u = (S0.u << S1.u[4:0]) + S2.u. + void + Inst_VOP3__V_LSHL_ADD_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int shift_amount = bits(src1[lane], 2, 0); + shift_amount = shift_amount > 4 ? 
0 : shift_amount; + vdst[lane] = (src0[lane] << shift_amount) + + src2[lane]; + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods --- + + Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pkaccum_u8_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PKACCUM_U8_F32 + + Inst_VOP3__V_CVT_PKACCUM_U8_F32::~Inst_VOP3__V_CVT_PKACCUM_U8_F32() + { + } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32 + + // --- description from .arch file --- + // byte = S1.u[1:0]; bit = byte * 8; + // D.u[bit+7:bit] = flt32_to_uint8(S0.f); + // Pack converted value of S0.f into byte S1 of the destination. + // SQ translates to V_CVT_PK_U8_F32. + // Note: this opcode uses src_c to pass destination in as a source. + void + Inst_VOP3__V_CVT_PKACCUM_U8_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P1_F32 class methods --- + + Inst_VOP3__V_INTERP_P1_F32::Inst_VOP3__V_INTERP_P1_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p1_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_INTERP_P1_F32 + + Inst_VOP3__V_INTERP_P1_F32::~Inst_VOP3__V_INTERP_P1_F32() + { + } // ~Inst_VOP3__V_INTERP_P1_F32 + + // --- description from .arch file --- + // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to + // V_MAD_F32 for SP). + // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; if + // D == S then data corruption will occur. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. 
+ void + Inst_VOP3__V_INTERP_P1_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P2_F32 class methods --- + + Inst_VOP3__V_INTERP_P2_F32::Inst_VOP3__V_INTERP_P2_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p2_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_INTERP_P2_F32 + + Inst_VOP3__V_INTERP_P2_F32::~Inst_VOP3__V_INTERP_P2_F32() + { + } // ~Inst_VOP3__V_INTERP_P2_F32 + + // --- description from .arch file --- + // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to + // V_MAD_F32 for SP). + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VOP3__V_INTERP_P2_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_MOV_F32 class methods --- + + Inst_VOP3__V_INTERP_MOV_F32::Inst_VOP3__V_INTERP_MOV_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_mov_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_INTERP_MOV_F32 + + Inst_VOP3__V_INTERP_MOV_F32::~Inst_VOP3__V_INTERP_MOV_F32() + { + } // ~Inst_VOP3__V_INTERP_MOV_F32 + + // --- description from .arch file --- + // D.f = {P10,P20,P0}[S.u]; parameter load. + void + Inst_VOP3__V_INTERP_MOV_F32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P1LL_F16 class methods --- + + Inst_VOP3__V_INTERP_P1LL_F16::Inst_VOP3__V_INTERP_P1LL_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p1ll_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_INTERP_P1LL_F16 + + Inst_VOP3__V_INTERP_P1LL_F16::~Inst_VOP3__V_INTERP_P1LL_F16() + { + } // ~Inst_VOP3__V_INTERP_P1LL_F16 + + // --- description from .arch file --- + // D.f32 = P10.f16 * S0.f32 + P0.f16. 
+ // 'LL' stands for 'two LDS arguments'. + // attr_word selects the high or low half 16 bits of each LDS dword + // accessed. + // This opcode is available for 32-bank LDS only. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VOP3__V_INTERP_P1LL_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P1LV_F16 class methods --- + + Inst_VOP3__V_INTERP_P1LV_F16::Inst_VOP3__V_INTERP_P1LV_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p1lv_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_INTERP_P1LV_F16 + + Inst_VOP3__V_INTERP_P1LV_F16::~Inst_VOP3__V_INTERP_P1LV_F16() + { + } // ~Inst_VOP3__V_INTERP_P1LV_F16 + + // --- description from .arch file --- + // D.f32 = P10.f16 * S0.f32 + (S2.u32 >> (attr_word * 16)).f16. + // 'LV' stands for 'One LDS and one VGPR argument'. + // S2 holds two parameters, attr_word selects the high or low word of the + // VGPR for this calculation, as well as the high or low half of the LDS + // data. + // Meant for use with 16-bank LDS. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. 
+ void + Inst_VOP3__V_INTERP_P1LV_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_INTERP_P2_F16 class methods --- + + Inst_VOP3__V_INTERP_P2_F16::Inst_VOP3__V_INTERP_P2_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_interp_p2_f16", false) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_INTERP_P2_F16 + + Inst_VOP3__V_INTERP_P2_F16::~Inst_VOP3__V_INTERP_P2_F16() + { + } // ~Inst_VOP3__V_INTERP_P2_F16 + + // --- description from .arch file --- + // D.f16 = P20.f16 * S0.f32 + S2.f32. + // Final computation. attr_word selects LDS high or low 16bits. Used for + // both 16- and 32-bank LDS. + // Result is always written to the 16 LSBs of the destination VGPR. + // NOTE: In textual representations the I/J VGPR is the first source and + // the attribute is the second source; however in the VOP3 encoding the + // attribute is stored in the src0 field and the VGPR is stored in the + // src1 field. + void + Inst_VOP3__V_INTERP_P2_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_ADD_F64 class methods --- + + Inst_VOP3__V_ADD_F64::Inst_VOP3__V_ADD_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_add_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_ADD_F64 + + Inst_VOP3__V_ADD_F64::~Inst_VOP3__V_ADD_F64() + { + } // ~Inst_VOP3__V_ADD_F64 + + // --- description from .arch file --- + // D.d = S0.d + S1.d. 
+ void + Inst_VOP3__V_ADD_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || + std::isnan(src1[lane]) ) { + vdst[lane] = NAN; + } else if (std::isinf(src0[lane]) && + std::isinf(src1[lane])) { + if (std::signbit(src0[lane]) != + std::signbit(src1[lane])) { + vdst[lane] = NAN; + } else { + vdst[lane] = src0[lane]; + } + } else if (std::isinf(src0[lane])) { + vdst[lane] = src0[lane]; + } else if (std::isinf(src1[lane])) { + vdst[lane] = src1[lane]; + } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + if (std::signbit(src0[lane]) && + std::signbit(src1[lane])) { + vdst[lane] = -0.0; + } else { + vdst[lane] = 0.0; + } + } else { + vdst[lane] = src1[lane]; + } + } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) { + if (std::signbit(src0[lane]) && + std::signbit(src1[lane])) { + vdst[lane] = -0.0; + } else { + vdst[lane] = 0.0; + } + } else { + vdst[lane] = src0[lane]; + } + } else { + vdst[lane] = src0[lane] + src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- 
Inst_VOP3__V_MUL_F64 class methods --- + + Inst_VOP3__V_MUL_F64::Inst_VOP3__V_MUL_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_MUL_F64 + + Inst_VOP3__V_MUL_F64::~Inst_VOP3__V_MUL_F64() + { + } // ~Inst_VOP3__V_MUL_F64 + + // --- description from .arch file --- + // D.d = S0.d * S1.d. + void + Inst_VOP3__V_MUL_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || + std::isnan(src1[lane])) { + vdst[lane] = NAN; + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + !std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL || + std::fpclassify(src0[lane]) == FP_ZERO) && + std::signbit(src0[lane])) { + if (std::isinf(src1[lane])) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +0.0; + } else { + vdst[lane] = -0.0; + } + } else if (std::isinf(src0[lane]) && + !std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (!std::signbit(src1[lane])) { + 
vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else if (std::isinf(src0[lane]) && + std::signbit(src0[lane])) { + if (std::fpclassify(src1[lane]) == FP_SUBNORMAL || + std::fpclassify(src1[lane]) == FP_ZERO) { + vdst[lane] = NAN; + } else if (std::signbit(src1[lane])) { + vdst[lane] = +INFINITY; + } else { + vdst[lane] = -INFINITY; + } + } else { + vdst[lane] = src0[lane] * src1[lane]; + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MIN_F64 class methods --- + + Inst_VOP3__V_MIN_F64::Inst_VOP3__V_MIN_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_min_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_MIN_F64 + + Inst_VOP3__V_MIN_F64::~Inst_VOP3__V_MIN_F64() + { + } // ~Inst_VOP3__V_MIN_F64 + + // --- description from .arch file --- + // D.d = min(S0.d, S1.d). + void + Inst_VOP3__V_MIN_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmin(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MAX_F64 class methods --- + + Inst_VOP3__V_MAX_F64::Inst_VOP3__V_MAX_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_max_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_MAX_F64 + + Inst_VOP3__V_MAX_F64::~Inst_VOP3__V_MAX_F64() + { + } // ~Inst_VOP3__V_MAX_F64 + + // --- description from 
.arch file --- + // D.d = max(S0.d, S1.d). + void + Inst_VOP3__V_MAX_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fmax(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LDEXP_F64 class methods --- + + Inst_VOP3__V_LDEXP_F64::Inst_VOP3__V_LDEXP_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ldexp_f64", false) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_LDEXP_F64 + + Inst_VOP3__V_LDEXP_F64::~Inst_VOP3__V_LDEXP_F64() + { + } // ~Inst_VOP3__V_LDEXP_F64 + + // --- description from .arch file --- + // D.d = pow(S0.d, S1.i[31:0]). 
+ void + Inst_VOP3__V_LDEXP_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandF64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (std::isnan(src0[lane]) || std::isinf(src0[lane])) { + vdst[lane] = src0[lane]; + } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + || std::fpclassify(src0[lane]) == FP_ZERO) { + if (std::signbit(src0[lane])) { + vdst[lane] = -0.0; + } else { + vdst[lane] = +0.0; + } + } else { + vdst[lane] = std::ldexp(src0[lane], src1[lane]); + } + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_LO_U32 class methods --- + + Inst_VOP3__V_MUL_LO_U32::Inst_VOP3__V_MUL_LO_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_lo_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_LO_U32 + + Inst_VOP3__V_MUL_LO_U32::~Inst_VOP3__V_MUL_LO_U32() + { + } // ~Inst_VOP3__V_MUL_LO_U32 + + // --- description from .arch file --- + // D.u = S0.u * S1.u. 
+ void + Inst_VOP3__V_MUL_LO_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 s0 = (VecElemI64)src0[lane]; + VecElemI64 s1 = (VecElemI64)src1[lane]; + vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_HI_U32 class methods --- + + Inst_VOP3__V_MUL_HI_U32::Inst_VOP3__V_MUL_HI_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_hi_u32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_HI_U32 + + Inst_VOP3__V_MUL_HI_U32::~Inst_VOP3__V_MUL_HI_U32() + { + } // ~Inst_VOP3__V_MUL_HI_U32 + + // --- description from .arch file --- + // D.u = (S0.u * S1.u) >> 32. 
+ void + Inst_VOP3__V_MUL_HI_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 s0 = (VecElemI64)src0[lane]; + VecElemI64 s1 = (VecElemI64)src1[lane]; + vdst[lane] + = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_MUL_HI_I32 class methods --- + + Inst_VOP3__V_MUL_HI_I32::Inst_VOP3__V_MUL_HI_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_mul_hi_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_MUL_HI_I32 + + Inst_VOP3__V_MUL_HI_I32::~Inst_VOP3__V_MUL_HI_I32() + { + } // ~Inst_VOP3__V_MUL_HI_I32 + + // --- description from .arch file --- + // D.i = (S0.i * S1.i) >> 32. 
+ void + Inst_VOP3__V_MUL_HI_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandI32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + VecElemI64 s0 = (VecElemI64)src0[lane]; + VecElemI64 s1 = (VecElemI64)src1[lane]; + vdst[lane] + = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LDEXP_F32 class methods --- + + Inst_VOP3__V_LDEXP_F32::Inst_VOP3__V_LDEXP_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ldexp_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_LDEXP_F32 + + Inst_VOP3__V_LDEXP_F32::~Inst_VOP3__V_LDEXP_F32() + { + } // ~Inst_VOP3__V_LDEXP_F32 + + // --- description from .arch file --- + // D.f = pow(S0.f, S1.i) + void + Inst_VOP3__V_LDEXP_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + VecOperandF32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::ldexp(src0[lane], src1[lane]); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_READLANE_B32 class methods --- + + 
Inst_VOP3__V_READLANE_B32::Inst_VOP3__V_READLANE_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_readlane_b32", true) + { + setFlag(ALU); + setFlag(IgnoreExec); + } // Inst_VOP3__V_READLANE_B32 + + Inst_VOP3__V_READLANE_B32::~Inst_VOP3__V_READLANE_B32() + { + } // ~Inst_VOP3__V_READLANE_B32 + + // --- description from .arch file --- + // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR# + // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask. + // Input and output modifiers not supported; this is an untyped operation. + void + Inst_VOP3__V_READLANE_B32::execute(GPUDynInstPtr gpuDynInst) + { + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU32 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.read(); + + sdst = src0[src1.rawData() & 0x3f]; + + sdst.write(); + } // execute + // --- Inst_VOP3__V_WRITELANE_B32 class methods --- + + Inst_VOP3__V_WRITELANE_B32::Inst_VOP3__V_WRITELANE_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_writelane_b32", false) + { + setFlag(ALU); + setFlag(IgnoreExec); + } // Inst_VOP3__V_WRITELANE_B32 + + Inst_VOP3__V_WRITELANE_B32::~Inst_VOP3__V_WRITELANE_B32() + { + } // ~Inst_VOP3__V_WRITELANE_B32 + + // --- description from .arch file --- + // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data + // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores + // exec mask. + // Input and output modifiers not supported; this is an untyped operation. + // SQ translates to V_MOV_B32. 
    void
    Inst_VOP3__V_WRITELANE_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.read();
        src1.read();
        // Read the destination first so the 63 untouched lanes keep their
        // current values when the register is written back.
        vdst.read();

        // Only the selected lane (modulo the 64-lane wavefront) is updated.
        vdst[src1.rawData() & 0x3f] = src0.rawData();

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_BCNT_U32_B32 class methods ---

    Inst_VOP3__V_BCNT_U32_B32::Inst_VOP3__V_BCNT_U32_B32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_bcnt_u32_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_BCNT_U32_B32

    Inst_VOP3__V_BCNT_U32_B32::~Inst_VOP3__V_BCNT_U32_B32()
    {
    } // ~Inst_VOP3__V_BCNT_U32_B32

    // --- description from .arch file ---
    // D.u = CountOneBits(S0.u) + S1.u. Bit count.
    void
    Inst_VOP3__V_BCNT_U32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Population count of S0 plus the per-lane addend S1.
                vdst[lane] = popCount(src0[lane]) + src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MBCNT_LO_U32_B32 class methods ---

    Inst_VOP3__V_MBCNT_LO_U32_B32::Inst_VOP3__V_MBCNT_LO_U32_B32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mbcnt_lo_u32_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MBCNT_LO_U32_B32

    Inst_VOP3__V_MBCNT_LO_U32_B32::~Inst_VOP3__V_MBCNT_LO_U32_B32()
    {
    } // ~Inst_VOP3__V_MBCNT_LO_U32_B32

    // --- description from .arch file ---
    // ThreadMask = (1 << ThreadPosition) - 1;
    // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u.
    // Masked bit count, ThreadPosition is the position of this thread in the
    // --- wavefront (in 0..63).
    void
    Inst_VOP3__V_MBCNT_LO_U32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        uint64_t threadMask = 0;

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Mask selecting all lanes strictly below this one; the LO
                // variant counts only mask bits 31:0.
                threadMask = ((1ULL << lane) - 1ULL);
                vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
                    src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods ---

    Inst_VOP3__V_MBCNT_HI_U32_B32::Inst_VOP3__V_MBCNT_HI_U32_B32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_mbcnt_hi_u32_b32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_MBCNT_HI_U32_B32

    Inst_VOP3__V_MBCNT_HI_U32_B32::~Inst_VOP3__V_MBCNT_HI_U32_B32()
    {
    } // ~Inst_VOP3__V_MBCNT_HI_U32_B32

    // --- description from .arch file ---
    // ThreadMask = (1 << ThreadPosition) - 1;
    // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u.
    // Masked bit count, ThreadPosition is the position of this thread in the
    // --- wavefront (in 0..63).
    void
    Inst_VOP3__V_MBCNT_HI_U32_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        uint64_t threadMask = 0;

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Mask of all lanes strictly below this one; the HI variant
                // counts only mask bits 63:32, so lanes 0-31 contribute
                // nothing beyond src1.
                threadMask = ((1ULL << lane) - 1ULL);
                vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
                    src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_LSHLREV_B64 class methods ---

    Inst_VOP3__V_LSHLREV_B64::Inst_VOP3__V_LSHLREV_B64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_lshlrev_b64", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_LSHLREV_B64

    Inst_VOP3__V_LSHLREV_B64::~Inst_VOP3__V_LSHLREV_B64()
    {
    } // ~Inst_VOP3__V_LSHLREV_B64

    // --- description from .arch file ---
    // D.u64 = S1.u64 << S0.u[5:0].
    // SQ translates this to an internal SP opcode.
+ void + Inst_VOP3__V_LSHLREV_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 5, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_LSHRREV_B64 class methods --- + + Inst_VOP3__V_LSHRREV_B64::Inst_VOP3__V_LSHRREV_B64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshrrev_b64", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHRREV_B64 + + Inst_VOP3__V_LSHRREV_B64::~Inst_VOP3__V_LSHRREV_B64() + { + } // ~Inst_VOP3__V_LSHRREV_B64 + + // --- description from .arch file --- + // D.u64 = S1.u64 >> S0.u[5:0]. + // The vacated bits are set to zero. + // SQ translates this to an internal SP opcode. 
+ void + Inst_VOP3__V_LSHRREV_B64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_ASHRREV_I64 class methods --- + + Inst_VOP3__V_ASHRREV_I64::Inst_VOP3__V_ASHRREV_I64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_ashrrev_i64", false) + { + setFlag(ALU); + } // Inst_VOP3__V_ASHRREV_I64 + + Inst_VOP3__V_ASHRREV_I64::~Inst_VOP3__V_ASHRREV_I64() + { + } // ~Inst_VOP3__V_ASHRREV_I64 + + // --- description from .arch file --- + // D.u64 = signext(S1.u64) >> S0.u[5:0]. + // The vacated bits are set to the sign bit of the input value. + // SQ translates this to an internal SP opcode. 
    void
    Inst_VOP3__V_ASHRREV_I64::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        // Signed source type so the shift below sign-extends.
        ConstVecOperandI64 src1(gpuDynInst, extData.SRC1);
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        /**
         * input modifiers are supported by FP operations only
         */
        assert(!(instData.ABS & 0x1));
        assert(!(instData.ABS & 0x2));
        assert(!(instData.ABS & 0x4));
        assert(!(extData.NEG & 0x1));
        assert(!(extData.NEG & 0x2));
        assert(!(extData.NEG & 0x4));

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // NOTE(review): right shift of a negative signed value is
                // implementation-defined pre-C++20; this assumes the host
                // compiler implements it as an arithmetic shift — confirm.
                vdst[lane]
                    = src1[lane] >> bits(src0[lane], 5, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP3__V_TRIG_PREOP_F64 class methods ---

    Inst_VOP3__V_TRIG_PREOP_F64::Inst_VOP3__V_TRIG_PREOP_F64(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_trig_preop_f64", false)
    {
        setFlag(ALU);
        setFlag(F64);
    } // Inst_VOP3__V_TRIG_PREOP_F64

    Inst_VOP3__V_TRIG_PREOP_F64::~Inst_VOP3__V_TRIG_PREOP_F64()
    {
    } // ~Inst_VOP3__V_TRIG_PREOP_F64

    // --- description from .arch file ---
    // D.d = Look Up 2/PI (S0.d) with segment select S1.u[4:0]. This operation
    // returns an aligned, double precision segment of 2/PI needed to do range
    // reduction on S0.d (double-precision value). Multiple segments can be
    // specified through S1.u[4:0]. Rounding is always round-to-zero. Large
    // inputs (exp > 1968) are scaled to avoid loss of precision through
    // denormalization.
+ void + Inst_VOP3__V_TRIG_PREOP_F64::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_BFM_B32 class methods --- + + Inst_VOP3__V_BFM_B32::Inst_VOP3__V_BFM_B32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_bfm_b32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_BFM_B32 + + Inst_VOP3__V_BFM_B32::~Inst_VOP3__V_BFM_B32() + { + } // ~Inst_VOP3__V_BFM_B32 + + // --- description from .arch file --- + // D.u = ((1<wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1) + << bits(src1[lane], 4, 0); + } + } + + vdst.write(); + } // execute + // --- Inst_VOP3__V_CVT_PKNORM_I16_F32 class methods --- + + Inst_VOP3__V_CVT_PKNORM_I16_F32::Inst_VOP3__V_CVT_PKNORM_I16_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pknorm_i16_f32", false) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CVT_PKNORM_I16_F32 + + Inst_VOP3__V_CVT_PKNORM_I16_F32::~Inst_VOP3__V_CVT_PKNORM_I16_F32() + { + } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32 + + // --- description from .arch file --- + // D = {(snorm)S1.f, (snorm)S0.f}. 
    // Not implemented in this model; decoding this opcode is fatal.
    void
    Inst_VOP3__V_CVT_PKNORM_I16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_PKNORM_U16_F32 class methods ---

    Inst_VOP3__V_CVT_PKNORM_U16_F32::Inst_VOP3__V_CVT_PKNORM_U16_F32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_pknorm_u16_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKNORM_U16_F32

    Inst_VOP3__V_CVT_PKNORM_U16_F32::~Inst_VOP3__V_CVT_PKNORM_U16_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32

    // --- description from .arch file ---
    // D = {(unorm)S1.f, (unorm)S0.f}.
    // Not implemented in this model; decoding this opcode is fatal.
    void
    Inst_VOP3__V_CVT_PKNORM_U16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_PKRTZ_F16_F32 class methods ---

    Inst_VOP3__V_CVT_PKRTZ_F16_F32::Inst_VOP3__V_CVT_PKRTZ_F16_F32(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_pkrtz_f16_f32", false)
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP3__V_CVT_PKRTZ_F16_F32

    Inst_VOP3__V_CVT_PKRTZ_F16_F32::~Inst_VOP3__V_CVT_PKRTZ_F16_F32()
    {
    } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32

    // --- description from .arch file ---
    // D = {flt32_to_flt16(S1.f),flt32_to_flt16(S0.f)}, with round-toward-zero
    // --- regardless of current round mode setting in hardware.
    // This opcode is intended for use with 16-bit compressed exports.
    // See V_CVT_F16_F32 for a version that respects the current rounding mode.
    // Not implemented in this model; decoding this opcode is fatal.
    void
    Inst_VOP3__V_CVT_PKRTZ_F16_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CVT_PK_U16_U32 class methods ---

    Inst_VOP3__V_CVT_PK_U16_U32::Inst_VOP3__V_CVT_PK_U16_U32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cvt_pk_u16_u32", false)
    {
        setFlag(ALU);
    } // Inst_VOP3__V_CVT_PK_U16_U32

    Inst_VOP3__V_CVT_PK_U16_U32::~Inst_VOP3__V_CVT_PK_U16_U32()
    {
    } // ~Inst_VOP3__V_CVT_PK_U16_U32

    // --- description from .arch file ---
    // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}.
+ void + Inst_VOP3__V_CVT_PK_U16_U32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PK_I16_I32 class methods --- + + Inst_VOP3__V_CVT_PK_I16_I32::Inst_VOP3__V_CVT_PK_I16_I32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pk_i16_i32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_CVT_PK_I16_I32 + + Inst_VOP3__V_CVT_PK_I16_I32::~Inst_VOP3__V_CVT_PK_I16_I32() + { + } // ~Inst_VOP3__V_CVT_PK_I16_I32 + + // --- description from .arch file --- + // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}. + void + Inst_VOP3__V_CVT_PK_I16_I32::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CVT_PK_FP8_F32 class methods --- + + Inst_VOP3__V_CVT_PK_FP8_F32::Inst_VOP3__V_CVT_PK_FP8_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cvt_pk_fp8_f32", false) + { + setFlag(ALU); + } // Inst_VOP3__V_CVT_PK_FP8_F32 + + Inst_VOP3__V_CVT_PK_FP8_F32::~Inst_VOP3__V_CVT_PK_FP8_F32() + { + } // ~Inst_VOP3__V_CVT_PK_FP8_F32 + + void + Inst_VOP3__V_CVT_PK_FP8_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + vdst.read(); // Preserve bits + + panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode); + panic_if(isDPPInst(), "DPP not supported for %s", _opcode); + panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode); + panic_if(extData.OMOD, "OMOD not supported for %s", _opcode); + + unsigned opsel = instData.OPSEL; + unsigned abs = instData.ABS; + unsigned neg = extData.NEG; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + AMDGPU::mxfloat8 tmp0(src0[lane]), tmp1(src1[lane]); + + if ((abs & 1) && (tmp0 < 0)) tmp0 = -tmp0; + if ((abs & 2) && (tmp1 < 0)) tmp1 = -tmp1; + if (neg & 1) tmp0 = -tmp0; + if (neg & 2) tmp1 
= -tmp1; + + uint16_t packed_data = (bits(tmp0.data, 31, 24) << 8) + | bits(tmp1.data, 31, 24); + + if (opsel & 8) { + replaceBits(vdst[lane], 31, 16, packed_data); + } else { + replaceBits(vdst[lane], 15, 0, packed_data); + } + } + } + + vdst.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3_cmp.cc b/src/arch/amdgpu/vega/insts/vop3_cmp.cc new file mode 100644 index 0000000000..4bbec930e6 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3_cmp.cc @@ -0,0 +1,8145 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/inst_util.hh" +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOP3__V_CMP_CLASS_F32 class methods --- + + Inst_VOP3__V_CMP_CLASS_F32::Inst_VOP3__V_CMP_CLASS_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_class_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_CLASS_F32 + + Inst_VOP3__V_CMP_CLASS_F32::~Inst_VOP3__V_CMP_CLASS_F32() + { + } // ~Inst_VOP3__V_CMP_CLASS_F32 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOP3__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_CLASS_F32 class methods --- + + Inst_VOP3__V_CMPX_CLASS_F32::Inst_VOP3__V_CMPX_CLASS_F32( + 
InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_class_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_CLASS_F32 + + Inst_VOP3__V_CMPX_CLASS_F32::~Inst_VOP3__V_CMPX_CLASS_F32() + { + } // ~Inst_VOP3__V_CMPX_CLASS_F32 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.f + // The function reports true if the floating point value is *any* of the + // numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOP3__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + 
continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_CLASS_F64 class methods --- + + Inst_VOP3__V_CMP_CLASS_F64::Inst_VOP3__V_CMP_CLASS_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_class_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_CLASS_F64 + + Inst_VOP3__V_CMP_CLASS_F64::~Inst_VOP3__V_CMP_CLASS_F64() + { + } // ~Inst_VOP3__V_CMP_CLASS_F64 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.d + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOP3__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_CLASS_F64 class methods --- + + Inst_VOP3__V_CMPX_CLASS_F64::Inst_VOP3__V_CMPX_CLASS_F64( + 
InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_class_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_CLASS_F64 + + Inst_VOP3__V_CMPX_CLASS_F64::~Inst_VOP3__V_CMPX_CLASS_F64() + { + } // ~Inst_VOP3__V_CMPX_CLASS_F64 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.d + // The function reports true if the floating point value is *any* of the + // numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOP3__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + 
continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + sdst.setBit(lane, 1); + continue; + } + } + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_CLASS_F16 class methods --- + + Inst_VOP3__V_CMP_CLASS_F16::Inst_VOP3__V_CMP_CLASS_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_class_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_CLASS_F16 + + Inst_VOP3__V_CMP_CLASS_F16::~Inst_VOP3__V_CMP_CLASS_F16() + { + } // ~Inst_VOP3__V_CMP_CLASS_F16 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
    // F16 class compares are not implemented in this model; decoding this
    // opcode is fatal.
    void
    Inst_VOP3__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CMPX_CLASS_F16 class methods ---

    Inst_VOP3__V_CMPX_CLASS_F16::Inst_VOP3__V_CMPX_CLASS_F16(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cmpx_class_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC);
    } // Inst_VOP3__V_CMPX_CLASS_F16

    Inst_VOP3__V_CMPX_CLASS_F16::~Inst_VOP3__V_CMPX_CLASS_F16()
    {
    } // ~Inst_VOP3__V_CMPX_CLASS_F16

    // --- description from .arch file ---
    // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on
    // --- S0.f16
    // The function reports true if the floating point value is *any* of the
    // --- numeric types selected in S1.u according to the following list:
    // S1.u[0] -- value is a signaling NaN.
    // S1.u[1] -- value is a quiet NaN.
    // S1.u[2] -- value is negative infinity.
    // S1.u[3] -- value is a negative normal value.
    // S1.u[4] -- value is a negative denormal value.
    // S1.u[5] -- value is negative zero.
    // S1.u[6] -- value is positive zero.
    // S1.u[7] -- value is a positive denormal value.
    // S1.u[8] -- value is a positive normal value.
    // S1.u[9] -- value is positive infinity.
    // Not implemented in this model; decoding this opcode is fatal.
    void
    Inst_VOP3__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CMP_F_F16 class methods ---

    Inst_VOP3__V_CMP_F_F16::Inst_VOP3__V_CMP_F_F16(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cmp_f_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_F_F16

    Inst_VOP3__V_CMP_F_F16::~Inst_VOP3__V_CMP_F_F16()
    {
    } // ~Inst_VOP3__V_CMP_F_F16

    // --- description from .arch file ---
    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
    // F16 compares are not implemented in this model; decoding this opcode
    // is fatal.
    void
    Inst_VOP3__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CMP_LT_F16 class methods ---

    Inst_VOP3__V_CMP_LT_F16::Inst_VOP3__V_CMP_LT_F16(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cmp_lt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_LT_F16

    Inst_VOP3__V_CMP_LT_F16::~Inst_VOP3__V_CMP_LT_F16()
    {
    } // ~Inst_VOP3__V_CMP_LT_F16

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
    // Not implemented in this model; decoding this opcode is fatal.
    void
    Inst_VOP3__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CMP_EQ_F16 class methods ---

    Inst_VOP3__V_CMP_EQ_F16::Inst_VOP3__V_CMP_EQ_F16(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cmp_eq_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_EQ_F16

    Inst_VOP3__V_CMP_EQ_F16::~Inst_VOP3__V_CMP_EQ_F16()
    {
    } // ~Inst_VOP3__V_CMP_EQ_F16

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
    // Not implemented in this model; decoding this opcode is fatal.
    void
    Inst_VOP3__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CMP_LE_F16 class methods ---

    Inst_VOP3__V_CMP_LE_F16::Inst_VOP3__V_CMP_LE_F16(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cmp_le_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_LE_F16

    Inst_VOP3__V_CMP_LE_F16::~Inst_VOP3__V_CMP_LE_F16()
    {
    } // ~Inst_VOP3__V_CMP_LE_F16

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
    // F16 compares are not implemented in this model; decoding this opcode
    // is fatal.
    void
    Inst_VOP3__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CMP_GT_F16 class methods ---

    Inst_VOP3__V_CMP_GT_F16::Inst_VOP3__V_CMP_GT_F16(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cmp_gt_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_GT_F16

    Inst_VOP3__V_CMP_GT_F16::~Inst_VOP3__V_CMP_GT_F16()
    {
    } // ~Inst_VOP3__V_CMP_GT_F16

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
    // Not implemented in this model; decoding this opcode is fatal.
    void
    Inst_VOP3__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CMP_LG_F16 class methods ---

    Inst_VOP3__V_CMP_LG_F16::Inst_VOP3__V_CMP_LG_F16(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cmp_lg_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_LG_F16

    Inst_VOP3__V_CMP_LG_F16::~Inst_VOP3__V_CMP_LG_F16()
    {
    } // ~Inst_VOP3__V_CMP_LG_F16

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
    // Not implemented in this model; decoding this opcode is fatal.
    void
    Inst_VOP3__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP3__V_CMP_GE_F16 class methods ---

    Inst_VOP3__V_CMP_GE_F16::Inst_VOP3__V_CMP_GE_F16(
          InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, "v_cmp_ge_f16", true)
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP3__V_CMP_GE_F16

    Inst_VOP3__V_CMP_GE_F16::~Inst_VOP3__V_CMP_GE_F16()
    {
    } // ~Inst_VOP3__V_CMP_GE_F16

    // --- description from .arch file ---
    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_O_F16 class methods --- + + Inst_VOP3__V_CMP_O_F16::Inst_VOP3__V_CMP_O_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_o_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_O_F16 + + Inst_VOP3__V_CMP_O_F16::~Inst_VOP3__V_CMP_O_F16() + { + } // ~Inst_VOP3__V_CMP_O_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_U_F16 class methods --- + + Inst_VOP3__V_CMP_U_F16::Inst_VOP3__V_CMP_U_F16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_u_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_U_F16 + + Inst_VOP3__V_CMP_U_F16::~Inst_VOP3__V_CMP_U_F16() + { + } // ~Inst_VOP3__V_CMP_U_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NGE_F16 class methods --- + + Inst_VOP3__V_CMP_NGE_F16::Inst_VOP3__V_CMP_NGE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nge_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NGE_F16 + + Inst_VOP3__V_CMP_NGE_F16::~Inst_VOP3__V_CMP_NGE_F16() + { + } // ~Inst_VOP3__V_CMP_NGE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NLG_F16 class methods --- + + Inst_VOP3__V_CMP_NLG_F16::Inst_VOP3__V_CMP_NLG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlg_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NLG_F16 + + Inst_VOP3__V_CMP_NLG_F16::~Inst_VOP3__V_CMP_NLG_F16() + { + } // ~Inst_VOP3__V_CMP_NLG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NGT_F16 class methods --- + + Inst_VOP3__V_CMP_NGT_F16::Inst_VOP3__V_CMP_NGT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ngt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NGT_F16 + + Inst_VOP3__V_CMP_NGT_F16::~Inst_VOP3__V_CMP_NGT_F16() + { + } // ~Inst_VOP3__V_CMP_NGT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NLE_F16 class methods --- + + Inst_VOP3__V_CMP_NLE_F16::Inst_VOP3__V_CMP_NLE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nle_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NLE_F16 + + Inst_VOP3__V_CMP_NLE_F16::~Inst_VOP3__V_CMP_NLE_F16() + { + } // ~Inst_VOP3__V_CMP_NLE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NEQ_F16 class methods --- + + Inst_VOP3__V_CMP_NEQ_F16::Inst_VOP3__V_CMP_NEQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_neq_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NEQ_F16 + + Inst_VOP3__V_CMP_NEQ_F16::~Inst_VOP3__V_CMP_NEQ_F16() + { + } // ~Inst_VOP3__V_CMP_NEQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_NLT_F16 class methods --- + + Inst_VOP3__V_CMP_NLT_F16::Inst_VOP3__V_CMP_NLT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlt_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_NLT_F16 + + Inst_VOP3__V_CMP_NLT_F16::~Inst_VOP3__V_CMP_NLT_F16() + { + } // ~Inst_VOP3__V_CMP_NLT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMP_TRU_F16 class methods --- + + Inst_VOP3__V_CMP_TRU_F16::Inst_VOP3__V_CMP_TRU_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_tru_f16", true) + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOP3__V_CMP_TRU_F16 + + Inst_VOP3__V_CMP_TRU_F16::~Inst_VOP3__V_CMP_TRU_F16() + { + } // ~Inst_VOP3__V_CMP_TRU_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+    void
+    Inst_VOP3__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Unconditionally-true compare: set the result bit for every
+        // active lane; inactive lanes are left untouched.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 1);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_F_F16 class methods ---
+
+    Inst_VOP3__V_CMPX_F_F16::Inst_VOP3__V_CMPX_F_F16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_f_f16", true)
+    {
+        setFlag(ALU);
+        // Fix: every other *_f16 compare in this family (v_cmpx_lt_f16,
+        // v_cmpx_tru_f16, ...) sets F16; it was missing here, so this
+        // instruction's operand type was misreported.
+        setFlag(F16);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_F_F16
+
+    Inst_VOP3__V_CMPX_F_F16::~Inst_VOP3__V_CMPX_F_F16()
+    {
+    } // ~Inst_VOP3__V_CMPX_F_F16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Always-false compare: clear the result bit for each active
+        // lane, then fold the result back into EXEC (CMPX semantics).
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 0);
+            }
+        }
+
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LT_F16 class methods ---
+
+    Inst_VOP3__V_CMPX_LT_F16::Inst_VOP3__V_CMPX_LT_F16(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_lt_f16", true)
+    {
+        setFlag(ALU);
+        setFlag(F16);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LT_F16
+
+    Inst_VOP3__V_CMPX_LT_F16::~Inst_VOP3__V_CMPX_LT_F16()
+    {
+    } // ~Inst_VOP3__V_CMPX_LT_F16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_F16 class methods --- + + Inst_VOP3__V_CMPX_EQ_F16::Inst_VOP3__V_CMPX_EQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_F16 + + Inst_VOP3__V_CMPX_EQ_F16::~Inst_VOP3__V_CMPX_EQ_F16() + { + } // ~Inst_VOP3__V_CMPX_EQ_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_F16 class methods --- + + Inst_VOP3__V_CMPX_LE_F16::Inst_VOP3__V_CMPX_LE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_F16 + + Inst_VOP3__V_CMPX_LE_F16::~Inst_VOP3__V_CMPX_LE_F16() + { + } // ~Inst_VOP3__V_CMPX_LE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_F16 class methods --- + + Inst_VOP3__V_CMPX_GT_F16::Inst_VOP3__V_CMPX_GT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_F16 + + Inst_VOP3__V_CMPX_GT_F16::~Inst_VOP3__V_CMPX_GT_F16() + { + } // ~Inst_VOP3__V_CMPX_GT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_LG_F16 class methods --- + + Inst_VOP3__V_CMPX_LG_F16::Inst_VOP3__V_CMPX_LG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lg_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LG_F16 + + Inst_VOP3__V_CMPX_LG_F16::~Inst_VOP3__V_CMPX_LG_F16() + { + } // ~Inst_VOP3__V_CMPX_LG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_F16 class methods --- + + Inst_VOP3__V_CMPX_GE_F16::Inst_VOP3__V_CMPX_GE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_F16 + + Inst_VOP3__V_CMPX_GE_F16::~Inst_VOP3__V_CMPX_GE_F16() + { + } // ~Inst_VOP3__V_CMPX_GE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_O_F16 class methods --- + + Inst_VOP3__V_CMPX_O_F16::Inst_VOP3__V_CMPX_O_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_o_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_O_F16 + + Inst_VOP3__V_CMPX_O_F16::~Inst_VOP3__V_CMPX_O_F16() + { + } // ~Inst_VOP3__V_CMPX_O_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. 
+ void + Inst_VOP3__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_U_F16 class methods --- + + Inst_VOP3__V_CMPX_U_F16::Inst_VOP3__V_CMPX_U_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_u_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_U_F16 + + Inst_VOP3__V_CMPX_U_F16::~Inst_VOP3__V_CMPX_U_F16() + { + } // ~Inst_VOP3__V_CMPX_U_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOP3__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NGE_F16 class methods --- + + Inst_VOP3__V_CMPX_NGE_F16::Inst_VOP3__V_CMPX_NGE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nge_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGE_F16 + + Inst_VOP3__V_CMPX_NGE_F16::~Inst_VOP3__V_CMPX_NGE_F16() + { + } // ~Inst_VOP3__V_CMPX_NGE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NLG_F16 class methods --- + + Inst_VOP3__V_CMPX_NLG_F16::Inst_VOP3__V_CMPX_NLG_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlg_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLG_F16 + + Inst_VOP3__V_CMPX_NLG_F16::~Inst_VOP3__V_CMPX_NLG_F16() + { + } // ~Inst_VOP3__V_CMPX_NLG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NGT_F16 class methods --- + + Inst_VOP3__V_CMPX_NGT_F16::Inst_VOP3__V_CMPX_NGT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ngt_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGT_F16 + + Inst_VOP3__V_CMPX_NGT_F16::~Inst_VOP3__V_CMPX_NGT_F16() + { + } // ~Inst_VOP3__V_CMPX_NGT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NLE_F16 class methods --- + + Inst_VOP3__V_CMPX_NLE_F16::Inst_VOP3__V_CMPX_NLE_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nle_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLE_F16 + + Inst_VOP3__V_CMPX_NLE_F16::~Inst_VOP3__V_CMPX_NLE_F16() + { + } // ~Inst_VOP3__V_CMPX_NLE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NEQ_F16 class methods --- + + Inst_VOP3__V_CMPX_NEQ_F16::Inst_VOP3__V_CMPX_NEQ_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_neq_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NEQ_F16 + + Inst_VOP3__V_CMPX_NEQ_F16::~Inst_VOP3__V_CMPX_NEQ_F16() + { + } // ~Inst_VOP3__V_CMPX_NEQ_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_NLT_F16 class methods --- + + Inst_VOP3__V_CMPX_NLT_F16::Inst_VOP3__V_CMPX_NLT_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlt_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLT_F16 + + Inst_VOP3__V_CMPX_NLT_F16::~Inst_VOP3__V_CMPX_NLT_F16() + { + } // ~Inst_VOP3__V_CMPX_NLT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOP3__V_CMPX_TRU_F16 class methods --- + + Inst_VOP3__V_CMPX_TRU_F16::Inst_VOP3__V_CMPX_TRU_F16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_tru_f16", true) + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_TRU_F16 + + Inst_VOP3__V_CMPX_TRU_F16::~Inst_VOP3__V_CMPX_TRU_F16() + { + } // ~Inst_VOP3__V_CMPX_TRU_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_F32 class methods --- + + Inst_VOP3__V_CMP_F_F32::Inst_VOP3__V_CMP_F_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_F_F32 + + Inst_VOP3__V_CMP_F_F32::~Inst_VOP3__V_CMP_F_F32() + { + } // ~Inst_VOP3__V_CMP_F_F32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_F32 class methods --- + + Inst_VOP3__V_CMP_LT_F32::Inst_VOP3__V_CMP_LT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_LT_F32 + + Inst_VOP3__V_CMP_LT_F32::~Inst_VOP3__V_CMP_LT_F32() + { + } // ~Inst_VOP3__V_CMP_LT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_F32 class methods --- + + Inst_VOP3__V_CMP_EQ_F32::Inst_VOP3__V_CMP_EQ_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_EQ_F32 + + Inst_VOP3__V_CMP_EQ_F32::~Inst_VOP3__V_CMP_EQ_F32() + { + } // ~Inst_VOP3__V_CMP_EQ_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_F32 class methods --- + + Inst_VOP3__V_CMP_LE_F32::Inst_VOP3__V_CMP_LE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_LE_F32 + + Inst_VOP3__V_CMP_LE_F32::~Inst_VOP3__V_CMP_LE_F32() + { + } // ~Inst_VOP3__V_CMP_LE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_F32 class methods --- + + Inst_VOP3__V_CMP_GT_F32::Inst_VOP3__V_CMP_GT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_GT_F32 + + Inst_VOP3__V_CMP_GT_F32::~Inst_VOP3__V_CMP_GT_F32() + { + } // ~Inst_VOP3__V_CMP_GT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+    void
+    Inst_VOP3__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        // Ordered greater-than; NaN operands yield 0 because C++
+        // operator> is false for unordered operands.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LG_F32 class methods ---
+
+    Inst_VOP3__V_CMP_LG_F32::Inst_VOP3__V_CMP_LG_F32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_lg_f32", true)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_CMP_LG_F32
+
+    Inst_VOP3__V_CMP_LG_F32::~Inst_VOP3__V_CMP_LG_F32()
+    {
+    } // ~Inst_VOP3__V_CMP_LG_F32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // Fix: LG is the *ordered* "less than or greater than"
+                // compare, so it must be false when either source is NaN.
+                // Plain != is true for NaN operands — that is the
+                // semantics of v_cmp_neq_f32, implemented with != later
+                // in this file. Spell out the ordered form, mirroring the
+                // complementary v_cmp_nlg_f32's !(S0 < S1 || S0 > S1).
+                sdst.setBit(lane, (src0[lane] < src1[lane]
+                    || src0[lane] > src1[lane]) ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GE_F32 class methods ---
+
+    Inst_VOP3__V_CMP_GE_F32::Inst_VOP3__V_CMP_GE_F32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ge_f32", true)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_CMP_GE_F32
+
+    Inst_VOP3__V_CMP_GE_F32::~Inst_VOP3__V_CMP_GE_F32()
+    {
+    } // ~Inst_VOP3__V_CMP_GE_F32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_O_F32 class methods --- + + Inst_VOP3__V_CMP_O_F32::Inst_VOP3__V_CMP_O_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_o_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_O_F32 + + Inst_VOP3__V_CMP_O_F32::~Inst_VOP3__V_CMP_O_F32() + { + } // ~Inst_VOP3__V_CMP_O_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_U_F32 class methods --- + + Inst_VOP3__V_CMP_U_F32::Inst_VOP3__V_CMP_U_F32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_u_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_U_F32 + + Inst_VOP3__V_CMP_U_F32::~Inst_VOP3__V_CMP_U_F32() + { + } // ~Inst_VOP3__V_CMP_U_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NGE_F32 class methods --- + + Inst_VOP3__V_CMP_NGE_F32::Inst_VOP3__V_CMP_NGE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nge_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NGE_F32 + + Inst_VOP3__V_CMP_NGE_F32::~Inst_VOP3__V_CMP_NGE_F32() + { + } // ~Inst_VOP3__V_CMP_NGE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLG_F32 class methods --- + + Inst_VOP3__V_CMP_NLG_F32::Inst_VOP3__V_CMP_NLG_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlg_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NLG_F32 + + Inst_VOP3__V_CMP_NLG_F32::~Inst_VOP3__V_CMP_NLG_F32() + { + } // ~Inst_VOP3__V_CMP_NLG_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NGT_F32 class methods --- + + Inst_VOP3__V_CMP_NGT_F32::Inst_VOP3__V_CMP_NGT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ngt_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NGT_F32 + + Inst_VOP3__V_CMP_NGT_F32::~Inst_VOP3__V_CMP_NGT_F32() + { + } // ~Inst_VOP3__V_CMP_NGT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLE_F32 class methods --- + + Inst_VOP3__V_CMP_NLE_F32::Inst_VOP3__V_CMP_NLE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nle_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NLE_F32 + + Inst_VOP3__V_CMP_NLE_F32::~Inst_VOP3__V_CMP_NLE_F32() + { + } // ~Inst_VOP3__V_CMP_NLE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NEQ_F32 class methods --- + + Inst_VOP3__V_CMP_NEQ_F32::Inst_VOP3__V_CMP_NEQ_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_neq_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NEQ_F32 + + Inst_VOP3__V_CMP_NEQ_F32::~Inst_VOP3__V_CMP_NEQ_F32() + { + } // ~Inst_VOP3__V_CMP_NEQ_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLT_F32 class methods --- + + Inst_VOP3__V_CMP_NLT_F32::Inst_VOP3__V_CMP_NLT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlt_f32", true) + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOP3__V_CMP_NLT_F32 + + Inst_VOP3__V_CMP_NLT_F32::~Inst_VOP3__V_CMP_NLT_F32() + { + } // ~Inst_VOP3__V_CMP_NLT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
+    void
+    Inst_VOP3__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wavefront = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        // Unordered not-less-than: the bit is set whenever the ordered
+        // src0 < src1 test fails, which includes NaN operands.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wavefront->execMask(lane)) {
+                continue;
+            }
+            sdst.setBit(lane, (src0[lane] < src1[lane]) ? 0 : 1);
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_TRU_F32 class methods ---
+
+    Inst_VOP3__V_CMP_TRU_F32::Inst_VOP3__V_CMP_TRU_F32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_tru_f32", true)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+    } // Inst_VOP3__V_CMP_TRU_F32
+
+    Inst_VOP3__V_CMP_TRU_F32::~Inst_VOP3__V_CMP_TRU_F32()
+    {
+    } // ~Inst_VOP3__V_CMP_TRU_F32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wavefront = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // Compare that always succeeds: raise the result bit for each
+        // lane currently enabled in EXEC; disabled lanes are untouched.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (!wavefront->execMask(lane)) {
+                continue;
+            }
+            sdst.setBit(lane, 1);
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_F_F32 class methods ---
+
+    Inst_VOP3__V_CMPX_F_F32::Inst_VOP3__V_CMPX_F_F32(
+        InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_f_f32", true)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_F_F32
+
+    Inst_VOP3__V_CMPX_F_F32::~Inst_VOP3__V_CMPX_F_F32()
+    {
+    } // ~Inst_VOP3__V_CMPX_F_F32
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_F32 class methods --- + + Inst_VOP3__V_CMPX_LT_F32::Inst_VOP3__V_CMPX_LT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_F32 + + Inst_VOP3__V_CMPX_LT_F32::~Inst_VOP3__V_CMPX_LT_F32() + { + } // ~Inst_VOP3__V_CMPX_LT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_F32 class methods --- + + Inst_VOP3__V_CMPX_EQ_F32::Inst_VOP3__V_CMPX_EQ_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_F32 + + Inst_VOP3__V_CMPX_EQ_F32::~Inst_VOP3__V_CMPX_EQ_F32() + { + } // ~Inst_VOP3__V_CMPX_EQ_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_F32 class methods --- + + Inst_VOP3__V_CMPX_LE_F32::Inst_VOP3__V_CMPX_LE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_F32 + + Inst_VOP3__V_CMPX_LE_F32::~Inst_VOP3__V_CMPX_LE_F32() + { + } // ~Inst_VOP3__V_CMPX_LE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_F32 class methods --- + + Inst_VOP3__V_CMPX_GT_F32::Inst_VOP3__V_CMPX_GT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_F32 + + Inst_VOP3__V_CMPX_GT_F32::~Inst_VOP3__V_CMPX_GT_F32() + { + } // ~Inst_VOP3__V_CMPX_GT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LG_F32 class methods --- + + Inst_VOP3__V_CMPX_LG_F32::Inst_VOP3__V_CMPX_LG_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lg_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LG_F32 + + Inst_VOP3__V_CMPX_LG_F32::~Inst_VOP3__V_CMPX_LG_F32() + { + } // ~Inst_VOP3__V_CMPX_LG_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_F32 class methods --- + + Inst_VOP3__V_CMPX_GE_F32::Inst_VOP3__V_CMPX_GE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_F32 + + Inst_VOP3__V_CMPX_GE_F32::~Inst_VOP3__V_CMPX_GE_F32() + { + } // ~Inst_VOP3__V_CMPX_GE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_O_F32 class methods --- + + Inst_VOP3__V_CMPX_O_F32::Inst_VOP3__V_CMPX_O_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_o_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_O_F32 + + Inst_VOP3__V_CMPX_O_F32::~Inst_VOP3__V_CMPX_O_F32() + { + } // ~Inst_VOP3__V_CMPX_O_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. 
+ void + Inst_VOP3__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_U_F32 class methods --- + + Inst_VOP3__V_CMPX_U_F32::Inst_VOP3__V_CMPX_U_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_u_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_U_F32 + + Inst_VOP3__V_CMPX_U_F32::~Inst_VOP3__V_CMPX_U_F32() + { + } // ~Inst_VOP3__V_CMPX_U_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOP3__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NGE_F32 class methods --- + + Inst_VOP3__V_CMPX_NGE_F32::Inst_VOP3__V_CMPX_NGE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nge_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGE_F32 + + Inst_VOP3__V_CMPX_NGE_F32::~Inst_VOP3__V_CMPX_NGE_F32() + { + } // ~Inst_VOP3__V_CMPX_NGE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLG_F32 class methods --- + + Inst_VOP3__V_CMPX_NLG_F32::Inst_VOP3__V_CMPX_NLG_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlg_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLG_F32 + + Inst_VOP3__V_CMPX_NLG_F32::~Inst_VOP3__V_CMPX_NLG_F32() + { + } // ~Inst_VOP3__V_CMPX_NLG_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NGT_F32 class methods --- + + Inst_VOP3__V_CMPX_NGT_F32::Inst_VOP3__V_CMPX_NGT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ngt_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGT_F32 + + Inst_VOP3__V_CMPX_NGT_F32::~Inst_VOP3__V_CMPX_NGT_F32() + { + } // ~Inst_VOP3__V_CMPX_NGT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLE_F32 class methods --- + + Inst_VOP3__V_CMPX_NLE_F32::Inst_VOP3__V_CMPX_NLE_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nle_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLE_F32 + + Inst_VOP3__V_CMPX_NLE_F32::~Inst_VOP3__V_CMPX_NLE_F32() + { + } // ~Inst_VOP3__V_CMPX_NLE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NEQ_F32 class methods --- + + Inst_VOP3__V_CMPX_NEQ_F32::Inst_VOP3__V_CMPX_NEQ_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_neq_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NEQ_F32 + + Inst_VOP3__V_CMPX_NEQ_F32::~Inst_VOP3__V_CMPX_NEQ_F32() + { + } // ~Inst_VOP3__V_CMPX_NEQ_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLT_F32 class methods --- + + Inst_VOP3__V_CMPX_NLT_F32::Inst_VOP3__V_CMPX_NLT_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlt_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLT_F32 + + Inst_VOP3__V_CMPX_NLT_F32::~Inst_VOP3__V_CMPX_NLT_F32() + { + } // ~Inst_VOP3__V_CMPX_NLT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_TRU_F32 class methods --- + + Inst_VOP3__V_CMPX_TRU_F32::Inst_VOP3__V_CMPX_TRU_F32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_tru_f32", true) + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_TRU_F32 + + Inst_VOP3__V_CMPX_TRU_F32::~Inst_VOP3__V_CMPX_TRU_F32() + { + } // ~Inst_VOP3__V_CMPX_TRU_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_F64 class methods --- + + Inst_VOP3__V_CMP_F_F64::Inst_VOP3__V_CMP_F_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_F_F64 + + Inst_VOP3__V_CMP_F_F64::~Inst_VOP3__V_CMP_F_F64() + { + } // ~Inst_VOP3__V_CMP_F_F64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_F64 class methods --- + + Inst_VOP3__V_CMP_LT_F64::Inst_VOP3__V_CMP_LT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_LT_F64 + + Inst_VOP3__V_CMP_LT_F64::~Inst_VOP3__V_CMP_LT_F64() + { + } // ~Inst_VOP3__V_CMP_LT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_F64 class methods --- + + Inst_VOP3__V_CMP_EQ_F64::Inst_VOP3__V_CMP_EQ_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_EQ_F64 + + Inst_VOP3__V_CMP_EQ_F64::~Inst_VOP3__V_CMP_EQ_F64() + { + } // ~Inst_VOP3__V_CMP_EQ_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_F64 class methods --- + + Inst_VOP3__V_CMP_LE_F64::Inst_VOP3__V_CMP_LE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_LE_F64 + + Inst_VOP3__V_CMP_LE_F64::~Inst_VOP3__V_CMP_LE_F64() + { + } // ~Inst_VOP3__V_CMP_LE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_F64 class methods --- + + Inst_VOP3__V_CMP_GT_F64::Inst_VOP3__V_CMP_GT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_GT_F64 + + Inst_VOP3__V_CMP_GT_F64::~Inst_VOP3__V_CMP_GT_F64() + { + } // ~Inst_VOP3__V_CMP_GT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LG_F64 class methods --- + + Inst_VOP3__V_CMP_LG_F64::Inst_VOP3__V_CMP_LG_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lg_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_LG_F64 + + Inst_VOP3__V_CMP_LG_F64::~Inst_VOP3__V_CMP_LG_F64() + { + } // ~Inst_VOP3__V_CMP_LG_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_F64 class methods --- + + Inst_VOP3__V_CMP_GE_F64::Inst_VOP3__V_CMP_GE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_GE_F64 + + Inst_VOP3__V_CMP_GE_F64::~Inst_VOP3__V_CMP_GE_F64() + { + } // ~Inst_VOP3__V_CMP_GE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_O_F64 class methods --- + + Inst_VOP3__V_CMP_O_F64::Inst_VOP3__V_CMP_O_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_o_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_O_F64 + + Inst_VOP3__V_CMP_O_F64::~Inst_VOP3__V_CMP_O_F64() + { + } // ~Inst_VOP3__V_CMP_O_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_U_F64 class methods --- + + Inst_VOP3__V_CMP_U_F64::Inst_VOP3__V_CMP_U_F64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_u_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_U_F64 + + Inst_VOP3__V_CMP_U_F64::~Inst_VOP3__V_CMP_U_F64() + { + } // ~Inst_VOP3__V_CMP_U_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NGE_F64 class methods --- + + Inst_VOP3__V_CMP_NGE_F64::Inst_VOP3__V_CMP_NGE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nge_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NGE_F64 + + Inst_VOP3__V_CMP_NGE_F64::~Inst_VOP3__V_CMP_NGE_F64() + { + } // ~Inst_VOP3__V_CMP_NGE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLG_F64 class methods --- + + Inst_VOP3__V_CMP_NLG_F64::Inst_VOP3__V_CMP_NLG_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlg_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NLG_F64 + + Inst_VOP3__V_CMP_NLG_F64::~Inst_VOP3__V_CMP_NLG_F64() + { + } // ~Inst_VOP3__V_CMP_NLG_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NGT_F64 class methods --- + + Inst_VOP3__V_CMP_NGT_F64::Inst_VOP3__V_CMP_NGT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ngt_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NGT_F64 + + Inst_VOP3__V_CMP_NGT_F64::~Inst_VOP3__V_CMP_NGT_F64() + { + } // ~Inst_VOP3__V_CMP_NGT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLE_F64 class methods --- + + Inst_VOP3__V_CMP_NLE_F64::Inst_VOP3__V_CMP_NLE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nle_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NLE_F64 + + Inst_VOP3__V_CMP_NLE_F64::~Inst_VOP3__V_CMP_NLE_F64() + { + } // ~Inst_VOP3__V_CMP_NLE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NEQ_F64 class methods --- + + Inst_VOP3__V_CMP_NEQ_F64::Inst_VOP3__V_CMP_NEQ_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_neq_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NEQ_F64 + + Inst_VOP3__V_CMP_NEQ_F64::~Inst_VOP3__V_CMP_NEQ_F64() + { + } // ~Inst_VOP3__V_CMP_NEQ_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NLT_F64 class methods --- + + Inst_VOP3__V_CMP_NLT_F64::Inst_VOP3__V_CMP_NLT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_nlt_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_NLT_F64 + + Inst_VOP3__V_CMP_NLT_F64::~Inst_VOP3__V_CMP_NLT_F64() + { + } // ~Inst_VOP3__V_CMP_NLT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_TRU_F64 class methods --- + + Inst_VOP3__V_CMP_TRU_F64::Inst_VOP3__V_CMP_TRU_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_tru_f64", true) + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOP3__V_CMP_TRU_F64 + + Inst_VOP3__V_CMP_TRU_F64::~Inst_VOP3__V_CMP_TRU_F64() + { + } // ~Inst_VOP3__V_CMP_TRU_F64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_F64 class methods --- + + Inst_VOP3__V_CMPX_F_F64::Inst_VOP3__V_CMPX_F_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_F64 + + Inst_VOP3__V_CMPX_F_F64::~Inst_VOP3__V_CMPX_F_F64() + { + } // ~Inst_VOP3__V_CMPX_F_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_F64 class methods --- + + Inst_VOP3__V_CMPX_LT_F64::Inst_VOP3__V_CMPX_LT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_F64 + + Inst_VOP3__V_CMPX_LT_F64::~Inst_VOP3__V_CMPX_LT_F64() + { + } // ~Inst_VOP3__V_CMPX_LT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_F64 class methods --- + + Inst_VOP3__V_CMPX_EQ_F64::Inst_VOP3__V_CMPX_EQ_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_F64 + + Inst_VOP3__V_CMPX_EQ_F64::~Inst_VOP3__V_CMPX_EQ_F64() + { + } // ~Inst_VOP3__V_CMPX_EQ_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_F64 class methods --- + + Inst_VOP3__V_CMPX_LE_F64::Inst_VOP3__V_CMPX_LE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_F64 + + Inst_VOP3__V_CMPX_LE_F64::~Inst_VOP3__V_CMPX_LE_F64() + { + } // ~Inst_VOP3__V_CMPX_LE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_F64 class methods --- + + Inst_VOP3__V_CMPX_GT_F64::Inst_VOP3__V_CMPX_GT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_F64 + + Inst_VOP3__V_CMPX_GT_F64::~Inst_VOP3__V_CMPX_GT_F64() + { + } // ~Inst_VOP3__V_CMPX_GT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LG_F64 class methods --- + + Inst_VOP3__V_CMPX_LG_F64::Inst_VOP3__V_CMPX_LG_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lg_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LG_F64 + + Inst_VOP3__V_CMPX_LG_F64::~Inst_VOP3__V_CMPX_LG_F64() + { + } // ~Inst_VOP3__V_CMPX_LG_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_F64 class methods --- + + Inst_VOP3__V_CMPX_GE_F64::Inst_VOP3__V_CMPX_GE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_F64 + + Inst_VOP3__V_CMPX_GE_F64::~Inst_VOP3__V_CMPX_GE_F64() + { + } // ~Inst_VOP3__V_CMPX_GE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_O_F64 class methods --- + + Inst_VOP3__V_CMPX_O_F64::Inst_VOP3__V_CMPX_O_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_o_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_O_F64 + + Inst_VOP3__V_CMPX_O_F64::~Inst_VOP3__V_CMPX_O_F64() + { + } // ~Inst_VOP3__V_CMPX_O_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. 
+ void + Inst_VOP3__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_U_F64 class methods --- + + Inst_VOP3__V_CMPX_U_F64::Inst_VOP3__V_CMPX_U_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_u_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_U_F64 + + Inst_VOP3__V_CMPX_U_F64::~Inst_VOP3__V_CMPX_U_F64() + { + } // ~Inst_VOP3__V_CMPX_U_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. 
+ void + Inst_VOP3__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NGE_F64 class methods --- + + Inst_VOP3__V_CMPX_NGE_F64::Inst_VOP3__V_CMPX_NGE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nge_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGE_F64 + + Inst_VOP3__V_CMPX_NGE_F64::~Inst_VOP3__V_CMPX_NGE_F64() + { + } // ~Inst_VOP3__V_CMPX_NGE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLG_F64 class methods --- + + Inst_VOP3__V_CMPX_NLG_F64::Inst_VOP3__V_CMPX_NLG_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlg_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLG_F64 + + Inst_VOP3__V_CMPX_NLG_F64::~Inst_VOP3__V_CMPX_NLG_F64() + { + } // ~Inst_VOP3__V_CMPX_NLG_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NGT_F64 class methods --- + + Inst_VOP3__V_CMPX_NGT_F64::Inst_VOP3__V_CMPX_NGT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ngt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NGT_F64 + + Inst_VOP3__V_CMPX_NGT_F64::~Inst_VOP3__V_CMPX_NGT_F64() + { + } // ~Inst_VOP3__V_CMPX_NGT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLE_F64 class methods --- + + Inst_VOP3__V_CMPX_NLE_F64::Inst_VOP3__V_CMPX_NLE_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nle_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLE_F64 + + Inst_VOP3__V_CMPX_NLE_F64::~Inst_VOP3__V_CMPX_NLE_F64() + { + } // ~Inst_VOP3__V_CMPX_NLE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NEQ_F64 class methods --- + + Inst_VOP3__V_CMPX_NEQ_F64::Inst_VOP3__V_CMPX_NEQ_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_neq_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NEQ_F64 + + Inst_VOP3__V_CMPX_NEQ_F64::~Inst_VOP3__V_CMPX_NEQ_F64() + { + } // ~Inst_VOP3__V_CMPX_NEQ_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NLT_F64 class methods --- + + Inst_VOP3__V_CMPX_NLT_F64::Inst_VOP3__V_CMPX_NLT_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_nlt_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NLT_F64 + + Inst_VOP3__V_CMPX_NLT_F64::~Inst_VOP3__V_CMPX_NLT_F64() + { + } // ~Inst_VOP3__V_CMPX_NLT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + if (instData.ABS & 0x1) { + src0.absModifier(); + } + + if (instData.ABS & 0x2) { + src1.absModifier(); + } + + if (extData.NEG & 0x1) { + src0.negModifier(); + } + + if (extData.NEG & 0x2) { + src1.negModifier(); + } + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_TRU_F64 class methods --- + + Inst_VOP3__V_CMPX_TRU_F64::Inst_VOP3__V_CMPX_TRU_F64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_tru_f64", true) + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_TRU_F64 + + Inst_VOP3__V_CMPX_TRU_F64::~Inst_VOP3__V_CMPX_TRU_F64() + { + } // ~Inst_VOP3__V_CMPX_TRU_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_I16 class methods --- + + Inst_VOP3__V_CMP_F_I16::Inst_VOP3__V_CMP_F_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_I16 + + Inst_VOP3__V_CMP_F_I16::~Inst_VOP3__V_CMP_F_I16() + { + } // ~Inst_VOP3__V_CMP_F_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_I16 class methods --- + + Inst_VOP3__V_CMP_LT_I16::Inst_VOP3__V_CMP_LT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_I16 + + Inst_VOP3__V_CMP_LT_I16::~Inst_VOP3__V_CMP_LT_I16() + { + } // ~Inst_VOP3__V_CMP_LT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_I16 class methods --- + + Inst_VOP3__V_CMP_EQ_I16::Inst_VOP3__V_CMP_EQ_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_I16 + + Inst_VOP3__V_CMP_EQ_I16::~Inst_VOP3__V_CMP_EQ_I16() + { + } // ~Inst_VOP3__V_CMP_EQ_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_I16 class methods --- + + Inst_VOP3__V_CMP_LE_I16::Inst_VOP3__V_CMP_LE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_I16 + + Inst_VOP3__V_CMP_LE_I16::~Inst_VOP3__V_CMP_LE_I16() + { + } // ~Inst_VOP3__V_CMP_LE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_I16 class methods --- + + Inst_VOP3__V_CMP_GT_I16::Inst_VOP3__V_CMP_GT_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_I16 + + Inst_VOP3__V_CMP_GT_I16::~Inst_VOP3__V_CMP_GT_I16() + { + } // ~Inst_VOP3__V_CMP_GT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_I16 class methods --- + + Inst_VOP3__V_CMP_NE_I16::Inst_VOP3__V_CMP_NE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_I16 + + Inst_VOP3__V_CMP_NE_I16::~Inst_VOP3__V_CMP_NE_I16() + { + } // ~Inst_VOP3__V_CMP_NE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_I16 class methods --- + + Inst_VOP3__V_CMP_GE_I16::Inst_VOP3__V_CMP_GE_I16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_I16 + + Inst_VOP3__V_CMP_GE_I16::~Inst_VOP3__V_CMP_GE_I16() + { + } // ~Inst_VOP3__V_CMP_GE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_I16 class methods --- + + Inst_VOP3__V_CMP_T_I16::Inst_VOP3__V_CMP_T_I16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_i16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_I16 + + Inst_VOP3__V_CMP_T_I16::~Inst_VOP3__V_CMP_T_I16() + { + } // ~Inst_VOP3__V_CMP_T_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_U16 class methods --- + + Inst_VOP3__V_CMP_F_U16::Inst_VOP3__V_CMP_F_U16(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_U16 + + Inst_VOP3__V_CMP_F_U16::~Inst_VOP3__V_CMP_F_U16() + { + } // ~Inst_VOP3__V_CMP_F_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_U16 class methods --- + + Inst_VOP3__V_CMP_LT_U16::Inst_VOP3__V_CMP_LT_U16( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_u16", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_U16 + + Inst_VOP3__V_CMP_LT_U16::~Inst_VOP3__V_CMP_LT_U16() + { + } // ~Inst_VOP3__V_CMP_LT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+    void
+    Inst_VOP3__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit less-than; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_EQ_U16 class methods ---
+
+    Inst_VOP3__V_CMP_EQ_U16::Inst_VOP3__V_CMP_EQ_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_eq_u16", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_EQ_U16
+
+    Inst_VOP3__V_CMP_EQ_U16::~Inst_VOP3__V_CMP_EQ_U16()
+    {
+    } // ~Inst_VOP3__V_CMP_EQ_U16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit equality; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] == src1[lane] ?
+                    1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LE_U16 class methods ---
+
+    Inst_VOP3__V_CMP_LE_U16::Inst_VOP3__V_CMP_LE_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_le_u16", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_LE_U16
+
+    Inst_VOP3__V_CMP_LE_U16::~Inst_VOP3__V_CMP_LE_U16()
+    {
+    } // ~Inst_VOP3__V_CMP_LE_U16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit less-or-equal; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GT_U16 class methods ---
+
+    Inst_VOP3__V_CMP_GT_U16::Inst_VOP3__V_CMP_GT_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_gt_u16", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_GT_U16
+
+    Inst_VOP3__V_CMP_GT_U16::~Inst_VOP3__V_CMP_GT_U16()
+    {
+    } // ~Inst_VOP3__V_CMP_GT_U16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit greater-than; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NE_U16 class methods ---
+
+    Inst_VOP3__V_CMP_NE_U16::Inst_VOP3__V_CMP_NE_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ne_u16", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_NE_U16
+
+    Inst_VOP3__V_CMP_NE_U16::~Inst_VOP3__V_CMP_NE_U16()
+    {
+    } // ~Inst_VOP3__V_CMP_NE_U16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit inequality; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ?
+                    1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GE_U16 class methods ---
+
+    Inst_VOP3__V_CMP_GE_U16::Inst_VOP3__V_CMP_GE_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ge_u16", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_GE_U16
+
+    Inst_VOP3__V_CMP_GE_U16::~Inst_VOP3__V_CMP_GE_U16()
+    {
+    } // ~Inst_VOP3__V_CMP_GE_U16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit greater-or-equal; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_T_U16 class methods ---
+
+    Inst_VOP3__V_CMP_T_U16::Inst_VOP3__V_CMP_T_U16(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_t_u16", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_T_U16
+
+    Inst_VOP3__V_CMP_T_U16::~Inst_VOP3__V_CMP_T_U16()
+    {
+    } // ~Inst_VOP3__V_CMP_T_U16
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // "Always true" compare: unconditionally set the result bit for
+        // every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 1);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_F_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_F_I16::Inst_VOP3__V_CMPX_F_I16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_f_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_F_I16
+
+    Inst_VOP3__V_CMPX_F_I16::~Inst_VOP3__V_CMPX_F_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_F_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // "Always false" compare: clear the result bit for every active
+        // lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LT_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_LT_I16::Inst_VOP3__V_CMPX_LT_I16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_lt_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LT_I16
+
+    Inst_VOP3__V_CMPX_LT_I16::~Inst_VOP3__V_CMPX_LT_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_LT_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 16-bit less-than; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_EQ_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_EQ_I16::Inst_VOP3__V_CMPX_EQ_I16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_eq_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_EQ_I16
+
+    Inst_VOP3__V_CMPX_EQ_I16::~Inst_VOP3__V_CMPX_EQ_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_EQ_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 16-bit equality; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] == src1[lane] ?
+                    1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LE_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_LE_I16::Inst_VOP3__V_CMPX_LE_I16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_le_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LE_I16
+
+    Inst_VOP3__V_CMPX_LE_I16::~Inst_VOP3__V_CMPX_LE_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_LE_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 16-bit less-or-equal; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GT_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_GT_I16::Inst_VOP3__V_CMPX_GT_I16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_gt_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GT_I16
+
+    Inst_VOP3__V_CMPX_GT_I16::~Inst_VOP3__V_CMPX_GT_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_GT_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 16-bit greater-than; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_NE_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_NE_I16::Inst_VOP3__V_CMPX_NE_I16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ne_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_NE_I16
+
+    Inst_VOP3__V_CMPX_NE_I16::~Inst_VOP3__V_CMPX_NE_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_NE_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 16-bit inequality; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ?
+                    1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GE_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_GE_I16::Inst_VOP3__V_CMPX_GE_I16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ge_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GE_I16
+
+    Inst_VOP3__V_CMPX_GE_I16::~Inst_VOP3__V_CMPX_GE_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_GE_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 16-bit greater-or-equal; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_T_I16 class methods ---
+
+    Inst_VOP3__V_CMPX_T_I16::Inst_VOP3__V_CMPX_T_I16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_t_i16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_T_I16
+
+    Inst_VOP3__V_CMPX_T_I16::~Inst_VOP3__V_CMPX_T_I16()
+    {
+    } // ~Inst_VOP3__V_CMPX_T_I16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // "Always true" compare: set the result bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 1);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_F_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_F_U16::Inst_VOP3__V_CMPX_F_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_f_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_F_U16
+
+    Inst_VOP3__V_CMPX_F_U16::~Inst_VOP3__V_CMPX_F_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_F_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // "Always false" compare: clear the result bit for every active
+        // lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LT_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_LT_U16::Inst_VOP3__V_CMPX_LT_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_lt_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LT_U16
+
+    Inst_VOP3__V_CMPX_LT_U16::~Inst_VOP3__V_CMPX_LT_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_LT_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit less-than; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_EQ_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_EQ_U16::Inst_VOP3__V_CMPX_EQ_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_eq_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_EQ_U16
+
+    Inst_VOP3__V_CMPX_EQ_U16::~Inst_VOP3__V_CMPX_EQ_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_EQ_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Unsigned operand views to match the _u16 opcode (previously
+        // declared as ConstVecOperandI16).
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit equality; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] == src1[lane] ?
+                    1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_LE_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_LE_U16::Inst_VOP3__V_CMPX_LE_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_le_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_LE_U16
+
+    Inst_VOP3__V_CMPX_LE_U16::~Inst_VOP3__V_CMPX_LE_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_LE_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // This is an unsigned compare: the operands must be read as u16.
+        // The previous ConstVecOperandI16 views ordered any value >=
+        // 0x8000 as negative, inverting the <= result for those lanes.
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit less-or-equal; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GT_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_GT_U16::Inst_VOP3__V_CMPX_GT_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_gt_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GT_U16
+
+    Inst_VOP3__V_CMPX_GT_U16::~Inst_VOP3__V_CMPX_GT_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_GT_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // This is an unsigned compare: the operands must be read as u16.
+        // The previous ConstVecOperandI16 views ordered any value >=
+        // 0x8000 as negative, inverting the > result for those lanes.
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit greater-than; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_NE_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_NE_U16::Inst_VOP3__V_CMPX_NE_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ne_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_NE_U16
+
+    Inst_VOP3__V_CMPX_NE_U16::~Inst_VOP3__V_CMPX_NE_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_NE_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // Unsigned operand views to match the _u16 opcode (previously
+        // declared as ConstVecOperandI16).
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit inequality; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ?
+                    1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_GE_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_GE_U16::Inst_VOP3__V_CMPX_GE_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_ge_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_GE_U16
+
+    Inst_VOP3__V_CMPX_GE_U16::~Inst_VOP3__V_CMPX_GE_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_GE_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        // This is an unsigned compare: the operands must be read as u16.
+        // The previous ConstVecOperandI16 views ordered any value >=
+        // 0x8000 as negative, inverting the >= result for those lanes.
+        ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Unsigned 16-bit greater-or-equal; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMPX_T_U16 class methods ---
+
+    Inst_VOP3__V_CMPX_T_U16::Inst_VOP3__V_CMPX_T_U16(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmpx_t_u16", true)
+    {
+        setFlag(ALU);
+        setFlag(WritesEXEC);
+    } // Inst_VOP3__V_CMPX_T_U16
+
+    Inst_VOP3__V_CMPX_T_U16::~Inst_VOP3__V_CMPX_T_U16()
+    {
+    } // ~Inst_VOP3__V_CMPX_T_U16
+
+    // --- description from .arch file ---
+    // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // "Always true" compare: set the result bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 1);
+            }
+        }
+
+        // CMPX variant: the comparison result also replaces EXEC.
+        wf->execMask() = sdst.rawData();
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_F_I32 class methods ---
+
+    Inst_VOP3__V_CMP_F_I32::Inst_VOP3__V_CMP_F_I32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_f_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_F_I32
+
+    Inst_VOP3__V_CMP_F_I32::~Inst_VOP3__V_CMP_F_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_F_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // "Always false" compare: clear the result bit for every active
+        // lane; no source operands are read.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LT_I32 class methods ---
+
+    Inst_VOP3__V_CMP_LT_I32::Inst_VOP3__V_CMP_LT_I32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_lt_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_LT_I32
+
+    Inst_VOP3__V_CMP_LT_I32::~Inst_VOP3__V_CMP_LT_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_LT_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 32-bit less-than; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_EQ_I32 class methods ---
+
+    Inst_VOP3__V_CMP_EQ_I32::Inst_VOP3__V_CMP_EQ_I32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_eq_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_EQ_I32
+
+    Inst_VOP3__V_CMP_EQ_I32::~Inst_VOP3__V_CMP_EQ_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_EQ_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 32-bit equality; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] == src1[lane] ?
+                    1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LE_I32 class methods ---
+
+    Inst_VOP3__V_CMP_LE_I32::Inst_VOP3__V_CMP_LE_I32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_le_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_LE_I32
+
+    Inst_VOP3__V_CMP_LE_I32::~Inst_VOP3__V_CMP_LE_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_LE_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 32-bit less-or-equal; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GT_I32 class methods ---
+
+    Inst_VOP3__V_CMP_GT_I32::Inst_VOP3__V_CMP_GT_I32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_gt_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_GT_I32
+
+    Inst_VOP3__V_CMP_GT_I32::~Inst_VOP3__V_CMP_GT_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_GT_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 32-bit greater-than; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_NE_I32 class methods ---
+
+    Inst_VOP3__V_CMP_NE_I32::Inst_VOP3__V_CMP_NE_I32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ne_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_NE_I32
+
+    Inst_VOP3__V_CMP_NE_I32::~Inst_VOP3__V_CMP_NE_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_NE_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 32-bit inequality; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] != src1[lane] ?
+                    1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_GE_I32 class methods ---
+
+    Inst_VOP3__V_CMP_GE_I32::Inst_VOP3__V_CMP_GE_I32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_ge_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_GE_I32
+
+    Inst_VOP3__V_CMP_GE_I32::~Inst_VOP3__V_CMP_GE_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_GE_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        // Signed 32-bit greater-or-equal; one result bit per active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_T_I32 class methods ---
+
+    Inst_VOP3__V_CMP_T_I32::Inst_VOP3__V_CMP_T_I32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_t_i32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_T_I32
+
+    Inst_VOP3__V_CMP_T_I32::~Inst_VOP3__V_CMP_T_I32()
+    {
+    } // ~Inst_VOP3__V_CMP_T_I32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 1; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // "Always true" compare: set the result bit for every active lane.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 1);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_F_U32 class methods ---
+
+    Inst_VOP3__V_CMP_F_U32::Inst_VOP3__V_CMP_F_U32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_f_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_F_U32
+
+    Inst_VOP3__V_CMP_F_U32::~Inst_VOP3__V_CMP_F_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_F_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = 0; D = VCC in VOPC encoding.
+    void
+    Inst_VOP3__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ScalarOperandU64 sdst(gpuDynInst, instData.VDST);
+
+        // "Always false" compare: clear the result bit for every active
+        // lane; no source operands are read.
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                sdst.setBit(lane, 0);
+            }
+        }
+
+        sdst.write();
+    } // execute
+    // --- Inst_VOP3__V_CMP_LT_U32 class methods ---
+
+    Inst_VOP3__V_CMP_LT_U32::Inst_VOP3__V_CMP_LT_U32(
+          InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_cmp_lt_u32", true)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_CMP_LT_U32
+
+    Inst_VOP3__V_CMP_LT_U32::~Inst_VOP3__V_CMP_LT_U32()
+    {
+    } // ~Inst_VOP3__V_CMP_LT_U32
+
+    // --- description from .arch file ---
+    // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding.
+ void + Inst_VOP3__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_U32 class methods --- + + Inst_VOP3__V_CMP_EQ_U32::Inst_VOP3__V_CMP_EQ_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_U32 + + Inst_VOP3__V_CMP_EQ_U32::~Inst_VOP3__V_CMP_EQ_U32() + { + } // ~Inst_VOP3__V_CMP_EQ_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_U32 class methods --- + + Inst_VOP3__V_CMP_LE_U32::Inst_VOP3__V_CMP_LE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_U32 + + Inst_VOP3__V_CMP_LE_U32::~Inst_VOP3__V_CMP_LE_U32() + { + } // ~Inst_VOP3__V_CMP_LE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_U32 class methods --- + + Inst_VOP3__V_CMP_GT_U32::Inst_VOP3__V_CMP_GT_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_U32 + + Inst_VOP3__V_CMP_GT_U32::~Inst_VOP3__V_CMP_GT_U32() + { + } // ~Inst_VOP3__V_CMP_GT_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_U32 class methods --- + + Inst_VOP3__V_CMP_NE_U32::Inst_VOP3__V_CMP_NE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_U32 + + Inst_VOP3__V_CMP_NE_U32::~Inst_VOP3__V_CMP_NE_U32() + { + } // ~Inst_VOP3__V_CMP_NE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_U32 class methods --- + + Inst_VOP3__V_CMP_GE_U32::Inst_VOP3__V_CMP_GE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_U32 + + Inst_VOP3__V_CMP_GE_U32::~Inst_VOP3__V_CMP_GE_U32() + { + } // ~Inst_VOP3__V_CMP_GE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_U32 class methods --- + + Inst_VOP3__V_CMP_T_U32::Inst_VOP3__V_CMP_T_U32(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_u32", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_U32 + + Inst_VOP3__V_CMP_T_U32::~Inst_VOP3__V_CMP_T_U32() + { + } // ~Inst_VOP3__V_CMP_T_U32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_I32 class methods --- + + Inst_VOP3__V_CMPX_F_I32::Inst_VOP3__V_CMPX_F_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_I32 + + Inst_VOP3__V_CMPX_F_I32::~Inst_VOP3__V_CMPX_F_I32() + { + } // ~Inst_VOP3__V_CMPX_F_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_I32 class methods --- + + Inst_VOP3__V_CMPX_LT_I32::Inst_VOP3__V_CMPX_LT_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_I32 + + Inst_VOP3__V_CMPX_LT_I32::~Inst_VOP3__V_CMPX_LT_I32() + { + } // ~Inst_VOP3__V_CMPX_LT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_I32 class methods --- + + Inst_VOP3__V_CMPX_EQ_I32::Inst_VOP3__V_CMPX_EQ_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_I32 + + Inst_VOP3__V_CMPX_EQ_I32::~Inst_VOP3__V_CMPX_EQ_I32() + { + } // ~Inst_VOP3__V_CMPX_EQ_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_I32 class methods --- + + Inst_VOP3__V_CMPX_LE_I32::Inst_VOP3__V_CMPX_LE_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_I32 + + Inst_VOP3__V_CMPX_LE_I32::~Inst_VOP3__V_CMPX_LE_I32() + { + } // ~Inst_VOP3__V_CMPX_LE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_I32 class methods --- + + Inst_VOP3__V_CMPX_GT_I32::Inst_VOP3__V_CMPX_GT_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_I32 + + Inst_VOP3__V_CMPX_GT_I32::~Inst_VOP3__V_CMPX_GT_I32() + { + } // ~Inst_VOP3__V_CMPX_GT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_I32 class methods --- + + Inst_VOP3__V_CMPX_NE_I32::Inst_VOP3__V_CMPX_NE_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_I32 + + Inst_VOP3__V_CMPX_NE_I32::~Inst_VOP3__V_CMPX_NE_I32() + { + } // ~Inst_VOP3__V_CMPX_NE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_I32 class methods --- + + Inst_VOP3__V_CMPX_GE_I32::Inst_VOP3__V_CMPX_GE_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_I32 + + Inst_VOP3__V_CMPX_GE_I32::~Inst_VOP3__V_CMPX_GE_I32() + { + } // ~Inst_VOP3__V_CMPX_GE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_I32 class methods --- + + Inst_VOP3__V_CMPX_T_I32::Inst_VOP3__V_CMPX_T_I32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_i32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_I32 + + Inst_VOP3__V_CMPX_T_I32::~Inst_VOP3__V_CMPX_T_I32() + { + } // ~Inst_VOP3__V_CMPX_T_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_U32 class methods --- + + Inst_VOP3__V_CMPX_F_U32::Inst_VOP3__V_CMPX_F_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_U32 + + Inst_VOP3__V_CMPX_F_U32::~Inst_VOP3__V_CMPX_F_U32() + { + } // ~Inst_VOP3__V_CMPX_F_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_U32 class methods --- + + Inst_VOP3__V_CMPX_LT_U32::Inst_VOP3__V_CMPX_LT_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_U32 + + Inst_VOP3__V_CMPX_LT_U32::~Inst_VOP3__V_CMPX_LT_U32() + { + } // ~Inst_VOP3__V_CMPX_LT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_U32 class methods --- + + Inst_VOP3__V_CMPX_EQ_U32::Inst_VOP3__V_CMPX_EQ_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_U32 + + Inst_VOP3__V_CMPX_EQ_U32::~Inst_VOP3__V_CMPX_EQ_U32() + { + } // ~Inst_VOP3__V_CMPX_EQ_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_U32 class methods --- + + Inst_VOP3__V_CMPX_LE_U32::Inst_VOP3__V_CMPX_LE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_U32 + + Inst_VOP3__V_CMPX_LE_U32::~Inst_VOP3__V_CMPX_LE_U32() + { + } // ~Inst_VOP3__V_CMPX_LE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_U32 class methods --- + + Inst_VOP3__V_CMPX_GT_U32::Inst_VOP3__V_CMPX_GT_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_U32 + + Inst_VOP3__V_CMPX_GT_U32::~Inst_VOP3__V_CMPX_GT_U32() + { + } // ~Inst_VOP3__V_CMPX_GT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_U32 class methods --- + + Inst_VOP3__V_CMPX_NE_U32::Inst_VOP3__V_CMPX_NE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_U32 + + Inst_VOP3__V_CMPX_NE_U32::~Inst_VOP3__V_CMPX_NE_U32() + { + } // ~Inst_VOP3__V_CMPX_NE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_U32 class methods --- + + Inst_VOP3__V_CMPX_GE_U32::Inst_VOP3__V_CMPX_GE_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_U32 + + Inst_VOP3__V_CMPX_GE_U32::~Inst_VOP3__V_CMPX_GE_U32() + { + } // ~Inst_VOP3__V_CMPX_GE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_U32 class methods --- + + Inst_VOP3__V_CMPX_T_U32::Inst_VOP3__V_CMPX_T_U32( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_u32", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_U32 + + Inst_VOP3__V_CMPX_T_U32::~Inst_VOP3__V_CMPX_T_U32() + { + } // ~Inst_VOP3__V_CMPX_T_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_I64 class methods --- + + Inst_VOP3__V_CMP_F_I64::Inst_VOP3__V_CMP_F_I64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_I64 + + Inst_VOP3__V_CMP_F_I64::~Inst_VOP3__V_CMP_F_I64() + { + } // ~Inst_VOP3__V_CMP_F_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_I64 class methods --- + + Inst_VOP3__V_CMP_LT_I64::Inst_VOP3__V_CMP_LT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_I64 + + Inst_VOP3__V_CMP_LT_I64::~Inst_VOP3__V_CMP_LT_I64() + { + } // ~Inst_VOP3__V_CMP_LT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_I64 class methods --- + + Inst_VOP3__V_CMP_EQ_I64::Inst_VOP3__V_CMP_EQ_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_I64 + + Inst_VOP3__V_CMP_EQ_I64::~Inst_VOP3__V_CMP_EQ_I64() + { + } // ~Inst_VOP3__V_CMP_EQ_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_I64 class methods --- + + Inst_VOP3__V_CMP_LE_I64::Inst_VOP3__V_CMP_LE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_I64 + + Inst_VOP3__V_CMP_LE_I64::~Inst_VOP3__V_CMP_LE_I64() + { + } // ~Inst_VOP3__V_CMP_LE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_I64 class methods --- + + Inst_VOP3__V_CMP_GT_I64::Inst_VOP3__V_CMP_GT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_I64 + + Inst_VOP3__V_CMP_GT_I64::~Inst_VOP3__V_CMP_GT_I64() + { + } // ~Inst_VOP3__V_CMP_GT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_I64 class methods --- + + Inst_VOP3__V_CMP_NE_I64::Inst_VOP3__V_CMP_NE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_I64 + + Inst_VOP3__V_CMP_NE_I64::~Inst_VOP3__V_CMP_NE_I64() + { + } // ~Inst_VOP3__V_CMP_NE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_I64 class methods --- + + Inst_VOP3__V_CMP_GE_I64::Inst_VOP3__V_CMP_GE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_I64 + + Inst_VOP3__V_CMP_GE_I64::~Inst_VOP3__V_CMP_GE_I64() + { + } // ~Inst_VOP3__V_CMP_GE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_I64 class methods --- + + Inst_VOP3__V_CMP_T_I64::Inst_VOP3__V_CMP_T_I64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_i64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_I64 + + Inst_VOP3__V_CMP_T_I64::~Inst_VOP3__V_CMP_T_I64() + { + } // ~Inst_VOP3__V_CMP_T_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_F_U64 class methods --- + + Inst_VOP3__V_CMP_F_U64::Inst_VOP3__V_CMP_F_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_f_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_F_U64 + + Inst_VOP3__V_CMP_F_U64::~Inst_VOP3__V_CMP_F_U64() + { + } // ~Inst_VOP3__V_CMP_F_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LT_U64 class methods --- + + Inst_VOP3__V_CMP_LT_U64::Inst_VOP3__V_CMP_LT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_lt_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LT_U64 + + Inst_VOP3__V_CMP_LT_U64::~Inst_VOP3__V_CMP_LT_U64() + { + } // ~Inst_VOP3__V_CMP_LT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_EQ_U64 class methods --- + + Inst_VOP3__V_CMP_EQ_U64::Inst_VOP3__V_CMP_EQ_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_eq_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_EQ_U64 + + Inst_VOP3__V_CMP_EQ_U64::~Inst_VOP3__V_CMP_EQ_U64() + { + } // ~Inst_VOP3__V_CMP_EQ_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_LE_U64 class methods --- + + Inst_VOP3__V_CMP_LE_U64::Inst_VOP3__V_CMP_LE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_le_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_LE_U64 + + Inst_VOP3__V_CMP_LE_U64::~Inst_VOP3__V_CMP_LE_U64() + { + } // ~Inst_VOP3__V_CMP_LE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GT_U64 class methods --- + + Inst_VOP3__V_CMP_GT_U64::Inst_VOP3__V_CMP_GT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_gt_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GT_U64 + + Inst_VOP3__V_CMP_GT_U64::~Inst_VOP3__V_CMP_GT_U64() + { + } // ~Inst_VOP3__V_CMP_GT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_NE_U64 class methods --- + + Inst_VOP3__V_CMP_NE_U64::Inst_VOP3__V_CMP_NE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ne_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_NE_U64 + + Inst_VOP3__V_CMP_NE_U64::~Inst_VOP3__V_CMP_NE_U64() + { + } // ~Inst_VOP3__V_CMP_NE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_GE_U64 class methods --- + + Inst_VOP3__V_CMP_GE_U64::Inst_VOP3__V_CMP_GE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_ge_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_GE_U64 + + Inst_VOP3__V_CMP_GE_U64::~Inst_VOP3__V_CMP_GE_U64() + { + } // ~Inst_VOP3__V_CMP_GE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMP_T_U64 class methods --- + + Inst_VOP3__V_CMP_T_U64::Inst_VOP3__V_CMP_T_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmp_t_u64", true) + { + setFlag(ALU); + } // Inst_VOP3__V_CMP_T_U64 + + Inst_VOP3__V_CMP_T_U64::~Inst_VOP3__V_CMP_T_U64() + { + } // ~Inst_VOP3__V_CMP_T_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_I64 class methods --- + + Inst_VOP3__V_CMPX_F_I64::Inst_VOP3__V_CMPX_F_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_I64 + + Inst_VOP3__V_CMPX_F_I64::~Inst_VOP3__V_CMPX_F_I64() + { + } // ~Inst_VOP3__V_CMPX_F_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_I64 class methods --- + + Inst_VOP3__V_CMPX_LT_I64::Inst_VOP3__V_CMPX_LT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_I64 + + Inst_VOP3__V_CMPX_LT_I64::~Inst_VOP3__V_CMPX_LT_I64() + { + } // ~Inst_VOP3__V_CMPX_LT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_I64 class methods --- + + Inst_VOP3__V_CMPX_EQ_I64::Inst_VOP3__V_CMPX_EQ_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_I64 + + Inst_VOP3__V_CMPX_EQ_I64::~Inst_VOP3__V_CMPX_EQ_I64() + { + } // ~Inst_VOP3__V_CMPX_EQ_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_I64 class methods --- + + Inst_VOP3__V_CMPX_LE_I64::Inst_VOP3__V_CMPX_LE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_I64 + + Inst_VOP3__V_CMPX_LE_I64::~Inst_VOP3__V_CMPX_LE_I64() + { + } // ~Inst_VOP3__V_CMPX_LE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_I64 class methods --- + + Inst_VOP3__V_CMPX_GT_I64::Inst_VOP3__V_CMPX_GT_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_I64 + + Inst_VOP3__V_CMPX_GT_I64::~Inst_VOP3__V_CMPX_GT_I64() + { + } // ~Inst_VOP3__V_CMPX_GT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_I64 class methods --- + + Inst_VOP3__V_CMPX_NE_I64::Inst_VOP3__V_CMPX_NE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_I64 + + Inst_VOP3__V_CMPX_NE_I64::~Inst_VOP3__V_CMPX_NE_I64() + { + } // ~Inst_VOP3__V_CMPX_NE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_I64 class methods --- + + Inst_VOP3__V_CMPX_GE_I64::Inst_VOP3__V_CMPX_GE_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_I64 + + Inst_VOP3__V_CMPX_GE_I64::~Inst_VOP3__V_CMPX_GE_I64() + { + } // ~Inst_VOP3__V_CMPX_GE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_I64 class methods --- + + Inst_VOP3__V_CMPX_T_I64::Inst_VOP3__V_CMPX_T_I64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_i64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_I64 + + Inst_VOP3__V_CMPX_T_I64::~Inst_VOP3__V_CMPX_T_I64() + { + } // ~Inst_VOP3__V_CMPX_T_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_F_U64 class methods --- + + Inst_VOP3__V_CMPX_F_U64::Inst_VOP3__V_CMPX_F_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_f_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_F_U64 + + Inst_VOP3__V_CMPX_F_U64::~Inst_VOP3__V_CMPX_F_U64() + { + } // ~Inst_VOP3__V_CMPX_F_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LT_U64 class methods --- + + Inst_VOP3__V_CMPX_LT_U64::Inst_VOP3__V_CMPX_LT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_lt_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LT_U64 + + Inst_VOP3__V_CMPX_LT_U64::~Inst_VOP3__V_CMPX_LT_U64() + { + } // ~Inst_VOP3__V_CMPX_LT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_EQ_U64 class methods --- + + Inst_VOP3__V_CMPX_EQ_U64::Inst_VOP3__V_CMPX_EQ_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_eq_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_EQ_U64 + + Inst_VOP3__V_CMPX_EQ_U64::~Inst_VOP3__V_CMPX_EQ_U64() + { + } // ~Inst_VOP3__V_CMPX_EQ_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] == src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_LE_U64 class methods --- + + Inst_VOP3__V_CMPX_LE_U64::Inst_VOP3__V_CMPX_LE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_le_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_LE_U64 + + Inst_VOP3__V_CMPX_LE_U64::~Inst_VOP3__V_CMPX_LE_U64() + { + } // ~Inst_VOP3__V_CMPX_LE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GT_U64 class methods --- + + Inst_VOP3__V_CMPX_GT_U64::Inst_VOP3__V_CMPX_GT_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_gt_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GT_U64 + + Inst_VOP3__V_CMPX_GT_U64::~Inst_VOP3__V_CMPX_GT_U64() + { + } // ~Inst_VOP3__V_CMPX_GT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_NE_U64 class methods --- + + Inst_VOP3__V_CMPX_NE_U64::Inst_VOP3__V_CMPX_NE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ne_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_NE_U64 + + Inst_VOP3__V_CMPX_NE_U64::~Inst_VOP3__V_CMPX_NE_U64() + { + } // ~Inst_VOP3__V_CMPX_NE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_GE_U64 class methods --- + + Inst_VOP3__V_CMPX_GE_U64::Inst_VOP3__V_CMPX_GE_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_ge_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_GE_U64 + + Inst_VOP3__V_CMPX_GE_U64::~Inst_VOP3__V_CMPX_GE_U64() + { + } // ~Inst_VOP3__V_CMPX_GE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOP3__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute + // --- Inst_VOP3__V_CMPX_T_U64 class methods --- + + Inst_VOP3__V_CMPX_T_U64::Inst_VOP3__V_CMPX_T_U64( + InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_cmpx_t_u64", true) + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOP3__V_CMPX_T_U64 + + Inst_VOP3__V_CMPX_T_U64::~Inst_VOP3__V_CMPX_T_U64() + { + } // ~Inst_VOP3__V_CMPX_T_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOP3__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 sdst(gpuDynInst, instData.VDST); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + sdst.setBit(lane, 1); + } + } + + wf->execMask() = sdst.rawData(); + sdst.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc new file mode 100644 index 0000000000..96c296df67 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3p.cc @@ -0,0 +1,913 @@ +/* + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "arch/amdgpu/vega/insts/vop3p.hh" + +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "arch/arm/insts/fplib.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + +using half = uint16_t; + +// Helper functions +template +int32_t +dotClampI(int32_t value, bool clamp) +{ + // Only valid for N < 32 + static_assert(N < 32); + + if (!clamp) { + return static_cast(value); + } + + int32_t min = -(1 << (N - 1)); + int32_t max = (1 << (N - 1)) - 1; + return std::clamp(value, min, max); +} + +template +uint32_t +dotClampU(uint32_t value, bool clamp) +{ + // Only valid for N < 32 + static_assert(N < 32); + + if (!clamp) { + return static_cast(value); + } + + uint32_t min = 0; + uint32_t max = (1 << N) - 1; + return std::clamp(value, min, max); +} + +int16_t +clampI16(int32_t value, bool clamp) +{ + if (!clamp) { + return static_cast(value); + } + + return std::clamp(value, + static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max())); +} + +uint16_t +clampU16(uint32_t value, bool clamp) +{ + if (!clamp) { + return static_cast(value); + } + + return std::clamp(value, + static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max())); +} + +uint16_t +clampF16(uint16_t value, bool clamp) +{ + if (!clamp) { + return value; + } + + // Values of one and zero in fp16. 
+ constexpr uint16_t one = 0x3c00; + constexpr uint16_t zero = 0x0; + ArmISA::FPSCR fpscr1, fpscr2; + + // If value > one, set to one, then if value < zero set to zero. + uint16_t imm = fplibMin(value, one, fpscr1); + return fplibMax(imm, zero, fpscr2); +} + +float +clampF32(float value, bool clamp) +{ + if (!clamp) { + return value; + } + + return std::clamp(value, 0.0f, 1.0f); +} + + + + +// Begin instruction execute definitions +void Inst_VOP3P__V_PK_MAD_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](int16_t S0, int16_t S1, int16_t S2, bool clamp) -> int16_t + { + return clampI16(S0 * S1 + S2, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void +Inst_VOP3P__V_PK_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t + { + // Only return lower 16 bits of result - This operation cannot clamp. + uint32_t D = S0 * S1; + uint16_t Dh = D & 0xFFFF; + return Dh; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ADD_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16(S0 + S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_SUB_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16(S0 - S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t + { + unsigned shift_val = bits(S0, 3, 0); + + // Shift does not clamp + return S1 << shift_val; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t + { + unsigned shift_val = bits(S0, 3, 0); + + return S1 >> shift_val; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void 
Inst_VOP3P__V_PK_ASHRREV_B16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + // Sign extend to larger type to ensure we don't lose sign bits when + // shifting. + int32_t S1e = S1; + unsigned shift_val = bits(S0, 3, 0); + + return S1e >> shift_val; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAX_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16((S0 >= S1) ? S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MIN_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16((S0 < S1) ? S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAD_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint16_t S0, uint16_t S1, uint16_t S2, bool clamp) -> uint16_t + { + return clampU16(S0 * S1 + S2, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ADD_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16(S0 + S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_SUB_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16(S0 - S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAX_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16((S0 >= S1) ? S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MIN_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16((S0 < S1) ? 
S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_FMA_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, half S2, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMulAdd(S2, S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibAdd(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MUL_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMul(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MIN_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMin(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMax(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT2_F32_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + half S0[elems]; + half S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + float S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + half C[elems]; + float Csum = 0.0f; + + for (int i = 0; i < elems; ++i) { + ArmISA::FPSCR fpscr; + C[i] = fplibMul(S0[i], S1[i], 
fpscr); + uint32_t conv = + ArmISA::fplibConvert( + C[i], ArmISA::FPRounding_TIEEVEN, fpscr); + Csum += clampF32(*reinterpret_cast(&conv), clamp); + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT2_I32_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; + int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT2_U32_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 16; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT4_I32_I8::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> 
uint32_t + { + constexpr unsigned INBITS = 8; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; + int32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT4_U32_U8::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 8; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT8_I32_I4::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 4; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + int32_t S2 = *reinterpret_cast(&S2r); + + // Compute components individually to prevent overflow across packing + int32_t C[elems]; + int32_t 
Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = sext(S0[i]) * sext(S1[i]); + C[i] = sext(dotClampI(C[i], clamp) & mask(INBITS)); + Csum += C[i]; + } + + Csum += S2; + uint32_t rv = *reinterpret_cast(&Csum); + + return rv; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t + { + constexpr unsigned INBITS = 4; + + constexpr unsigned elems = 32 / INBITS; + uint32_t S0[elems]; + uint32_t S1[elems]; + + for (int i = 0; i < elems; ++i) { + S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS); + S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS); + } + + // Compute components individually to prevent overflow across packing + uint32_t C[elems]; + uint32_t Csum = 0; + + for (int i = 0; i < elems; ++i) { + C[i] = S0[i] * S1[i]; + C[i] = dotClampU(C[i], clamp); + Csum += C[i]; + } + + Csum += S2; + + return Csum; + }; + + dotHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *wf = gpuDynInst->wavefront(); + unsigned accum_offset = wf->accumOffset; + + ConstVecOperandU32 src(gpuDynInst, extData.SRC0+accum_offset); + VecOperandU32 vdst(gpuDynInst, instData.VDST); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); +} + +void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *wf = gpuDynInst->wavefront(); + unsigned accum_offset = wf->accumOffset; + + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset); + + src.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } + } + + vdst.write(); +} + +// --- Inst_VOP3P__V_PK_FMA_F32 class methods --- + 
+Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_fma_f32") +{ + setFlag(ALU); +} // Inst_VOP3P__V_PK_FMA_F32 + +Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32() +{ +} // ~Inst_VOP3P__V_PK_FMA_F32 + +// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] = +// S0.f[31:0] * S1.f[31:0] + S2.f[31:0] . +void +Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst) +{ + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. U64 is used here as float + // values cannot use bitwise operations. Consider the U64 to imply + // untyped 64-bits of data. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + int opsel = instData.OPSEL; + int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2); + + int neg = extData.NEG; + int neg_hi = instData.NEG_HI; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32) + : bits(src2[lane], 31, 0); + + float s0lf = *reinterpret_cast(&s0l); + float s1lf = *reinterpret_cast(&s1l); + float s2lf = *reinterpret_cast(&s2l); + + if (neg & 1) s0lf = -s0lf; + if (neg & 1) s1lf = -s1lf; + if (neg & 1) s2lf = -s2lf; + + float dword1 = std::fma(s0lf, s1lf, s2lf); + + uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + uint32_t s2h = (opsel_hi & 4) ? 
bits(src2[lane], 63, 32) + : bits(src2[lane], 31, 0); + + float s0hf = *reinterpret_cast(&s0h); + float s1hf = *reinterpret_cast(&s1h); + float s2hf = *reinterpret_cast(&s2h); + + if (neg_hi & 1) s0hf = -s0hf; + if (neg_hi & 1) s1hf = -s1hf; + if (neg_hi & 1) s2hf = -s2hf; + + float dword2 = std::fma(s0hf, s1hf, s2hf); + + uint32_t result1 = *reinterpret_cast(&dword1); + uint32_t result2 = *reinterpret_cast(&dword2); + + vdst[lane] = (static_cast(result2) << 32) | result1; + } + } + + vdst.write(); +} // execute +// --- Inst_VOP3P__V_PK_MUL_F32 class methods --- + +Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_mul_f32") +{ + setFlag(ALU); +} // Inst_VOP3P__V_PK_MUL_F32 + +Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32() +{ +} // ~Inst_VOP3P__V_PK_MUL_F32 + +// D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] * +// S1.f[31:0] +void +Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst) +{ + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. U64 is used here as float + // values cannot use bitwise operations. Consider the U64 to imply + // untyped 64-bits of data. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + int opsel = instData.OPSEL; + int opsel_hi = extData.OPSEL_HI; + + int neg = extData.NEG; + int neg_hi = instData.NEG_HI; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t upper_dword = (opsel & 2) ? 
bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float ldwordf = *reinterpret_cast(&lower_dword); + float udwordf = *reinterpret_cast(&upper_dword); + + if (neg & 1) ldwordf = -ldwordf; + if (neg & 2) udwordf = -udwordf; + + float dword1 = ldwordf * udwordf; + + lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + ldwordf = *reinterpret_cast(&lower_dword); + udwordf = *reinterpret_cast(&upper_dword); + + if (neg_hi & 1) ldwordf = -ldwordf; + if (neg_hi & 2) udwordf = -udwordf; + + float dword2 = ldwordf * udwordf; + + uint32_t result1 = *reinterpret_cast(&dword1); + uint32_t result2 = *reinterpret_cast(&dword2); + + vdst[lane] = (static_cast(result2) << 32) | result1; + } + } + + vdst.write(); +} // execute +// --- Inst_VOP3P__V_PK_ADD_F32 class methods --- + +Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_add_f32") +{ + setFlag(ALU); +} // Inst_VOP3P__V_PK_ADD_F32 + +Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32() +{ +} // ~Inst_VOP3P__V_PK_ADD_F32 + +// D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] + +// S1.f[31:0] +void +Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst) +{ + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. U64 is used here as float + // values cannot use bitwise operations. Consider the U64 to imply + // untyped 64-bits of data. 
+ Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode); + panic_if(isDPPInst(), "DPP not supported for %s", _opcode); + + int opsel = instData.OPSEL; + int opsel_hi = extData.OPSEL_HI; + + int neg = extData.NEG; + int neg_hi = instData.NEG_HI; + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + float ldwordf = *reinterpret_cast(&lower_dword); + float udwordf = *reinterpret_cast(&upper_dword); + + if (neg & 1) ldwordf = -ldwordf; + if (neg & 2) udwordf = -udwordf; + + float dword1 = ldwordf + udwordf; + + lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + ldwordf = *reinterpret_cast(&lower_dword); + udwordf = *reinterpret_cast(&upper_dword); + + if (neg_hi & 1) ldwordf = -ldwordf; + if (neg_hi & 2) udwordf = -udwordf; + + float dword2 = ldwordf + udwordf; + + uint32_t result1 = *reinterpret_cast(&dword1); + uint32_t result2 = *reinterpret_cast(&dword2); + + vdst[lane] = (static_cast(result2) << 32) | result1; + } + } + + vdst.write(); +} // execute +// --- Inst_VOP3P__V_PK_MOV_B32 class methods --- + +Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_mov_b32") +{ + setFlag(ALU); +} // Inst_VOP3P__V_PK_MOV_B32 + +Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32() +{ +} // ~Inst_VOP3P__V_PK_MOV_B32 + +// D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0]. 
+void +Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst) +{ + // This is a special case of packed instructions which operates on + // 64-bit inputs/outputs and not 32-bit. + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, extData.SRC1); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + + // Only OPSEL[1:0] are used + // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0 + int opsel = instData.OPSEL; + + warn_if(instData.NEG_HI || extData.NEG, + "Negative modifier undefined for %s", _opcode); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + // OPSEL[1] 0/1: Lower dest dword = lower/upper dword of src1 + uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32) + : bits(src0[lane], 31, 0); + uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32) + : bits(src1[lane], 31, 0); + + vdst[lane] = upper_dword << 32 | lower_dword; + } + } + + vdst.write(); +} // execute + +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p.hh b/src/arch/amdgpu/vega/insts/vop3p.hh new file mode 100644 index 0000000000..fbb81f12f7 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3p.hh @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ARCH_VEGA_INSTS_VOP3P_HH__ +#define __ARCH_VEGA_INSTS_VOP3P_HH__ + +#include "arch/amdgpu/vega/gpu_decoder.hh" +#include "arch/amdgpu/vega/insts/gpu_static_inst.hh" +#include "arch/amdgpu/vega/insts/op_encodings.hh" +#include "debug/VEGA.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // One source operand + class Inst_VOP3P__1OP : public Inst_VOP3P + { + public: + Inst_VOP3P__1OP(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src + return 4; + case 1: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + + // Two source operands with two 16-bit values in a dword + class 
Inst_VOP3P__2OP_X16 : public Inst_VOP3P + { + public: + Inst_VOP3P__2OP_X16(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 4; + case 1: // src1 + return 4; + case 2: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + + // Three source operands with two 16-bit values in a dword + class Inst_VOP3P__3OP_X16 : public Inst_VOP3P + { + public: + Inst_VOP3P__3OP_X16(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 4; + case 1: // src1 + return 4; + case 2: // src2 + return 4; + case 3: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + + + + // Begin instruction implementations + class Inst_VOP3P__V_PK_MAD_I16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_PK_MAD_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_pk_mad_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MUL_LO_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MUL_LO_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_mul_lo_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ADD_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ADD_I16(InFmt_VOP3P *iFmt) + 
: Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_SUB_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_SUB_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_sub_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_LSHLREV_B16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_LSHLREV_B16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_lshlrev_b16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_LSHRREV_B16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_LSHRREV_B16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_lshrrev_b16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ASHRREV_B16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ASHRREV_B16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_ashrrev_b16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAX_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MAX_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MIN_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MIN_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAD_U16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_PK_MAD_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_pk_mad_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ADD_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ADD_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_u16") + { } + + void execute(GPUDynInstPtr 
gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_SUB_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_SUB_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_sub_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAX_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MAX_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MIN_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MIN_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_FMA_F16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_PK_FMA_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_pk_fma_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ADD_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ADD_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MUL_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MUL_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_mul_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MIN_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MIN_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAX_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MAX_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT2_F32_F16 : public Inst_VOP3P__3OP_X16 + { + public: + 
Inst_VOP3P__V_DOT2_F32_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_f32_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT2_I32_I16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT2_I32_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_i32_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT2_U32_U16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT2_U32_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot2_u32_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT4_I32_I8 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT4_I32_I8(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot4_i32_i8") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT4_U32_U8 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT4_U32_U8(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot4_u32_u8") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT8_I32_I4 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT8_I32_I4(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot8_i32_i4") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_DOT8_U32_U4 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_DOT8_U32_U4(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_dot8_u32_u4") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_ACCVGPR_READ : public Inst_VOP3P__1OP + { + public: + Inst_VOP3P__V_ACCVGPR_READ(InFmt_VOP3P *iFmt) + : Inst_VOP3P__1OP(iFmt, "v_accvgpr_read") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_ACCVGPR_WRITE : public Inst_VOP3P__1OP + { + public: + Inst_VOP3P__V_ACCVGPR_WRITE(InFmt_VOP3P *iFmt) + : Inst_VOP3P__1OP(iFmt, "v_accvgpr_write") + { } + + 
void execute(GPUDynInstPtr gpuDynInst) override; + }; +} // namespace VegaISA +} // namespace gem5 + +#endif // __ARCH_VEGA_INSTS_VOP3P_HH__ diff --git a/src/arch/amdgpu/gcn3/gpu_types.hh b/src/arch/amdgpu/vega/insts/vop3p_mai.cc similarity index 61% rename from src/arch/amdgpu/gcn3/gpu_types.hh rename to src/arch/amdgpu/vega/insts/vop3p_mai.cc index 4cb862de48..d9bf0dd516 100644 --- a/src/arch/amdgpu/gcn3/gpu_types.hh +++ b/src/arch/amdgpu/vega/insts/vop3p_mai.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2021 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,38 +29,13 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ARCH_GCN3_GPU_TYPES_HH__ -#define __ARCH_GCN3_GPU_TYPES_HH__ - -#include +#include "arch/amdgpu/vega/insts/instructions.hh" +#include "arch/amdgpu/vega/insts/vop3p.hh" namespace gem5 { -namespace Gcn3ISA +namespace VegaISA { - union InstFormat; - - /** - * used to represnt a GPU inst in its raw format. GCN3 - * instructions may be 32b or 64b, therefore we represent - * a raw inst with 64b to ensure that all of its inst data, - * including potential immediate values, may be represented - * in the worst case. - */ - typedef uint64_t RawMachInst; - - /** - * used to represent the encoding of a GCN3 inst. each portion - * of a GCN3 inst must be 1 DWORD (32b), so we use a pointer - * to InstFormat type (which is 32b). for the case in which we - * need multiple DWORDS to represnt a single inst, this pointer - * essentialy acts as an array of the DWORDs needed to represent - * the entire inst encoding. 
- */ - typedef InstFormat *MachInst; - -} // namespace Gcn3ISA +} // namespace VegaISA } // namespace gem5 - -#endif // __ARCH_GCN3_GPU_TYPES_HH__ diff --git a/src/arch/amdgpu/vega/insts/vopc.cc b/src/arch/amdgpu/vega/insts/vopc.cc new file mode 100644 index 0000000000..2c386fec74 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vopc.cc @@ -0,0 +1,6590 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "arch/amdgpu/vega/insts/instructions.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // --- Inst_VOPC__V_CMP_CLASS_F32 class methods --- + + Inst_VOPC__V_CMP_CLASS_F32::Inst_VOPC__V_CMP_CLASS_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_class_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_CLASS_F32 + + Inst_VOPC__V_CMP_CLASS_F32::~Inst_VOPC__V_CMP_CLASS_F32() + { + } // ~Inst_VOPC__V_CMP_CLASS_F32 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOPC__V_CMP_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_CLASS_F32 class methods --- + + Inst_VOPC__V_CMPX_CLASS_F32::Inst_VOPC__V_CMPX_CLASS_F32(InFmt_VOPC 
*iFmt) + : Inst_VOPC(iFmt, "v_cmpx_class_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_CLASS_F32 + + Inst_VOPC__V_CMPX_CLASS_F32::~Inst_VOPC__V_CMPX_CLASS_F32() + { + } // ~Inst_VOPC__V_CMPX_CLASS_F32 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.f The function reports true if the floating point value is *any* of + // the numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOPC__V_CMPX_CLASS_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if 
(bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMP_CLASS_F64 class methods --- + + Inst_VOPC__V_CMP_CLASS_F64::Inst_VOPC__V_CMP_CLASS_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_class_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_CLASS_F64 + + Inst_VOPC__V_CMP_CLASS_F64::~Inst_VOPC__V_CMP_CLASS_F64() + { + } // ~Inst_VOPC__V_CMP_CLASS_F64 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.d + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOPC__V_CMP_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_CLASS_F64 class methods --- + + Inst_VOPC__V_CMPX_CLASS_F64::Inst_VOPC__V_CMPX_CLASS_F64(InFmt_VOPC 
*iFmt) + : Inst_VOPC(iFmt, "v_cmpx_class_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_CLASS_F64 + + Inst_VOPC__V_CMPX_CLASS_F64::~Inst_VOPC__V_CMPX_CLASS_F64() + { + } // ~Inst_VOPC__V_CMPX_CLASS_F64 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // S0.d The function reports true if the floating point value is *any* of + // the numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOPC__V_CMPX_CLASS_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + if (bits(src1[lane], 0) || bits(src1[lane], 1)) { + // is NaN + if (std::isnan(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 2)) { + // is -infinity + if (std::isinf(src0[lane]) && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 3)) { + // is -normal + if (std::isnormal(src0[lane]) + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 4)) { + // is -denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if 
(bits(src1[lane], 5)) { + // is -zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 6)) { + // is +zero + if (std::fpclassify(src0[lane]) == FP_ZERO + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 7)) { + // is +denormal + if (std::fpclassify(src0[lane]) == FP_SUBNORMAL + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 8)) { + // is +normal + if (std::isnormal(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + if (bits(src1[lane], 9)) { + // is +infinity + if (std::isinf(src0[lane]) + && !std::signbit(src0[lane])) { + vcc.setBit(lane, 1); + continue; + } + } + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMP_CLASS_F16 class methods --- + + Inst_VOPC__V_CMP_CLASS_F16::Inst_VOPC__V_CMP_CLASS_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_class_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_CLASS_F16 + + Inst_VOPC__V_CMP_CLASS_F16::~Inst_VOPC__V_CMP_CLASS_F16() + { + } // ~Inst_VOPC__V_CMP_CLASS_F16 + + // --- description from .arch file --- + // VCC = IEEE numeric class function specified in S1.u, performed on S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. 
+ void + Inst_VOPC__V_CMP_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_CLASS_F16 class methods --- + + Inst_VOPC__V_CMPX_CLASS_F16::Inst_VOPC__V_CMPX_CLASS_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_class_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_CLASS_F16 + + Inst_VOPC__V_CMPX_CLASS_F16::~Inst_VOPC__V_CMPX_CLASS_F16() + { + } // ~Inst_VOPC__V_CMPX_CLASS_F16 + + // --- description from .arch file --- + // EXEC, VCC = IEEE numeric class function specified in S1.u, performed on + // --- S0.f16 + // The function reports true if the floating point value is *any* of the + // --- numeric types selected in S1.u according to the following list: + // S1.u[0] -- value is a signaling NaN. + // S1.u[1] -- value is a quiet NaN. + // S1.u[2] -- value is negative infinity. + // S1.u[3] -- value is a negative normal value. + // S1.u[4] -- value is a negative denormal value. + // S1.u[5] -- value is negative zero. + // S1.u[6] -- value is positive zero. + // S1.u[7] -- value is a positive denormal value. + // S1.u[8] -- value is a positive normal value. + // S1.u[9] -- value is positive infinity. + void + Inst_VOPC__V_CMPX_CLASS_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_F_F16 class methods --- + + Inst_VOPC__V_CMP_F_F16::Inst_VOPC__V_CMP_F_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_F_F16 + + Inst_VOPC__V_CMP_F_F16::~Inst_VOPC__V_CMP_F_F16() + { + } // ~Inst_VOPC__V_CMP_F_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_LT_F16 class methods --- + + Inst_VOPC__V_CMP_LT_F16::Inst_VOPC__V_CMP_LT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_LT_F16 + + Inst_VOPC__V_CMP_LT_F16::~Inst_VOPC__V_CMP_LT_F16() + { + } // ~Inst_VOPC__V_CMP_LT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_F16 class methods --- + + Inst_VOPC__V_CMP_EQ_F16::Inst_VOPC__V_CMP_EQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_EQ_F16 + + Inst_VOPC__V_CMP_EQ_F16::~Inst_VOPC__V_CMP_EQ_F16() + { + } // ~Inst_VOPC__V_CMP_EQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_LE_F16 class methods --- + + Inst_VOPC__V_CMP_LE_F16::Inst_VOPC__V_CMP_LE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_LE_F16 + + Inst_VOPC__V_CMP_LE_F16::~Inst_VOPC__V_CMP_LE_F16() + { + } // ~Inst_VOPC__V_CMP_LE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_GT_F16 class methods --- + + Inst_VOPC__V_CMP_GT_F16::Inst_VOPC__V_CMP_GT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_GT_F16 + + Inst_VOPC__V_CMP_GT_F16::~Inst_VOPC__V_CMP_GT_F16() + { + } // ~Inst_VOPC__V_CMP_GT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_LG_F16 class methods --- + + Inst_VOPC__V_CMP_LG_F16::Inst_VOPC__V_CMP_LG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lg_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_LG_F16 + + Inst_VOPC__V_CMP_LG_F16::~Inst_VOPC__V_CMP_LG_F16() + { + } // ~Inst_VOPC__V_CMP_LG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_GE_F16 class methods --- + + Inst_VOPC__V_CMP_GE_F16::Inst_VOPC__V_CMP_GE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_GE_F16 + + Inst_VOPC__V_CMP_GE_F16::~Inst_VOPC__V_CMP_GE_F16() + { + } // ~Inst_VOPC__V_CMP_GE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_O_F16 class methods --- + + Inst_VOPC__V_CMP_O_F16::Inst_VOPC__V_CMP_O_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_o_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_O_F16 + + Inst_VOPC__V_CMP_O_F16::~Inst_VOPC__V_CMP_O_F16() + { + } // ~Inst_VOPC__V_CMP_O_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_U_F16 class methods --- + + Inst_VOPC__V_CMP_U_F16::Inst_VOPC__V_CMP_U_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_u_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_U_F16 + + Inst_VOPC__V_CMP_U_F16::~Inst_VOPC__V_CMP_U_F16() + { + } // ~Inst_VOPC__V_CMP_U_F16 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NGE_F16 class methods --- + + Inst_VOPC__V_CMP_NGE_F16::Inst_VOPC__V_CMP_NGE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nge_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NGE_F16 + + Inst_VOPC__V_CMP_NGE_F16::~Inst_VOPC__V_CMP_NGE_F16() + { + } // ~Inst_VOPC__V_CMP_NGE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NLG_F16 class methods --- + + Inst_VOPC__V_CMP_NLG_F16::Inst_VOPC__V_CMP_NLG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlg_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NLG_F16 + + Inst_VOPC__V_CMP_NLG_F16::~Inst_VOPC__V_CMP_NLG_F16() + { + } // ~Inst_VOPC__V_CMP_NLG_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NGT_F16 class methods --- + + Inst_VOPC__V_CMP_NGT_F16::Inst_VOPC__V_CMP_NGT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ngt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NGT_F16 + + Inst_VOPC__V_CMP_NGT_F16::~Inst_VOPC__V_CMP_NGT_F16() + { + } // ~Inst_VOPC__V_CMP_NGT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NGT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NLE_F16 class methods --- + + Inst_VOPC__V_CMP_NLE_F16::Inst_VOPC__V_CMP_NLE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nle_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NLE_F16 + + Inst_VOPC__V_CMP_NLE_F16::~Inst_VOPC__V_CMP_NLE_F16() + { + } // ~Inst_VOPC__V_CMP_NLE_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_NLE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NEQ_F16 class methods --- + + Inst_VOPC__V_CMP_NEQ_F16::Inst_VOPC__V_CMP_NEQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_neq_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NEQ_F16 + + Inst_VOPC__V_CMP_NEQ_F16::~Inst_VOPC__V_CMP_NEQ_F16() + { + } // ~Inst_VOPC__V_CMP_NEQ_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NEQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_NLT_F16 class methods --- + + Inst_VOPC__V_CMP_NLT_F16::Inst_VOPC__V_CMP_NLT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlt_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_NLT_F16 + + Inst_VOPC__V_CMP_NLT_F16::~Inst_VOPC__V_CMP_NLT_F16() + { + } // ~Inst_VOPC__V_CMP_NLT_F16 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_TRU_F16 class methods --- + + Inst_VOPC__V_CMP_TRU_F16::Inst_VOPC__V_CMP_TRU_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_tru_f16") + { + setFlag(ALU); + setFlag(F16); + } // Inst_VOPC__V_CMP_TRU_F16 + + Inst_VOPC__V_CMP_TRU_F16::~Inst_VOPC__V_CMP_TRU_F16() + { + } // ~Inst_VOPC__V_CMP_TRU_F16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_TRU_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_F_F16 class methods --- + + Inst_VOPC__V_CMPX_F_F16::Inst_VOPC__V_CMPX_F_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_F16 + + Inst_VOPC__V_CMPX_F_F16::~Inst_VOPC__V_CMPX_F_F16() + { + } // ~Inst_VOPC__V_CMPX_F_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_F16 class methods --- + + Inst_VOPC__V_CMPX_LT_F16::Inst_VOPC__V_CMPX_LT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_F16 + + Inst_VOPC__V_CMPX_LT_F16::~Inst_VOPC__V_CMPX_LT_F16() + { + } // ~Inst_VOPC__V_CMPX_LT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_F16 class methods --- + + Inst_VOPC__V_CMPX_EQ_F16::Inst_VOPC__V_CMPX_EQ_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_F16 + + Inst_VOPC__V_CMPX_EQ_F16::~Inst_VOPC__V_CMPX_EQ_F16() + { + } // ~Inst_VOPC__V_CMPX_EQ_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_EQ_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_F16 class methods --- + + Inst_VOPC__V_CMPX_LE_F16::Inst_VOPC__V_CMPX_LE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_F16 + + Inst_VOPC__V_CMPX_LE_F16::~Inst_VOPC__V_CMPX_LE_F16() + { + } // ~Inst_VOPC__V_CMPX_LE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_F16 class methods --- + + Inst_VOPC__V_CMPX_GT_F16::Inst_VOPC__V_CMPX_GT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_F16 + + Inst_VOPC__V_CMPX_GT_F16::~Inst_VOPC__V_CMPX_GT_F16() + { + } // ~Inst_VOPC__V_CMPX_GT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_LG_F16 class methods --- + + Inst_VOPC__V_CMPX_LG_F16::Inst_VOPC__V_CMPX_LG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lg_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LG_F16 + + Inst_VOPC__V_CMPX_LG_F16::~Inst_VOPC__V_CMPX_LG_F16() + { + } // ~Inst_VOPC__V_CMPX_LG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_F16 class methods --- + + Inst_VOPC__V_CMPX_GE_F16::Inst_VOPC__V_CMPX_GE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_F16 + + Inst_VOPC__V_CMPX_GE_F16::~Inst_VOPC__V_CMPX_GE_F16() + { + } // ~Inst_VOPC__V_CMPX_GE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_O_F16 class methods --- + + Inst_VOPC__V_CMPX_O_F16::Inst_VOPC__V_CMPX_O_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_o_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_O_F16 + + Inst_VOPC__V_CMPX_O_F16::~Inst_VOPC__V_CMPX_O_F16() + { + } // ~Inst_VOPC__V_CMPX_O_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_O_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_U_F16 class methods --- + + Inst_VOPC__V_CMPX_U_F16::Inst_VOPC__V_CMPX_U_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_u_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_U_F16 + + Inst_VOPC__V_CMPX_U_F16::~Inst_VOPC__V_CMPX_U_F16() + { + } // ~Inst_VOPC__V_CMPX_U_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. 
+ void + Inst_VOPC__V_CMPX_U_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_NGE_F16 class methods --- + + Inst_VOPC__V_CMPX_NGE_F16::Inst_VOPC__V_CMPX_NGE_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nge_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGE_F16 + + Inst_VOPC__V_CMPX_NGE_F16::~Inst_VOPC__V_CMPX_NGE_F16() + { + } // ~Inst_VOPC__V_CMPX_NGE_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NGE_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_NLG_F16 class methods --- + + Inst_VOPC__V_CMPX_NLG_F16::Inst_VOPC__V_CMPX_NLG_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlg_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLG_F16 + + Inst_VOPC__V_CMPX_NLG_F16::~Inst_VOPC__V_CMPX_NLG_F16() + { + } // ~Inst_VOPC__V_CMPX_NLG_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLG_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_NGT_F16 class methods --- + + Inst_VOPC__V_CMPX_NGT_F16::Inst_VOPC__V_CMPX_NGT_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ngt_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGT_F16 + + Inst_VOPC__V_CMPX_NGT_F16::~Inst_VOPC__V_CMPX_NGT_F16() + { + } // ~Inst_VOPC__V_CMPX_NGT_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
    // Unimplemented: panics if decoded and executed.
    void
    Inst_VOPC__V_CMPX_NGT_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_NLE_F16 class methods ---

    Inst_VOPC__V_CMPX_NLE_F16::Inst_VOPC__V_CMPX_NLE_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nle_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC); // CMPX variants write the EXEC mask.
    } // Inst_VOPC__V_CMPX_NLE_F16

    Inst_VOPC__V_CMPX_NLE_F16::~Inst_VOPC__V_CMPX_NLE_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NLE_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding.
    // Unimplemented: panics if decoded and executed.
    void
    Inst_VOPC__V_CMPX_NLE_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_NEQ_F16 class methods ---

    Inst_VOPC__V_CMPX_NEQ_F16::Inst_VOPC__V_CMPX_NEQ_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_neq_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC); // CMPX variants write the EXEC mask.
    } // Inst_VOPC__V_CMPX_NEQ_F16

    Inst_VOPC__V_CMPX_NEQ_F16::~Inst_VOPC__V_CMPX_NEQ_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NEQ_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding.
    // Unimplemented: panics if decoded and executed.
    void
    Inst_VOPC__V_CMPX_NEQ_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOPC__V_CMPX_NLT_F16 class methods ---

    Inst_VOPC__V_CMPX_NLT_F16::Inst_VOPC__V_CMPX_NLT_F16(InFmt_VOPC *iFmt)
        : Inst_VOPC(iFmt, "v_cmpx_nlt_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(WritesEXEC); // CMPX variants write the EXEC mask.
    } // Inst_VOPC__V_CMPX_NLT_F16

    Inst_VOPC__V_CMPX_NLT_F16::~Inst_VOPC__V_CMPX_NLT_F16()
    {
    } // ~Inst_VOPC__V_CMPX_NLT_F16

    // --- description from .arch file ---
    // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding.
+ void + Inst_VOPC__V_CMPX_NLT_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMPX_TRU_F16 class methods --- + + Inst_VOPC__V_CMPX_TRU_F16::Inst_VOPC__V_CMPX_TRU_F16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_tru_f16") + { + setFlag(ALU); + setFlag(F16); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_TRU_F16 + + Inst_VOPC__V_CMPX_TRU_F16::~Inst_VOPC__V_CMPX_TRU_F16() + { + } // ~Inst_VOPC__V_CMPX_TRU_F16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_TRU_F16::execute(GPUDynInstPtr gpuDynInst) + { + panicUnimplemented(); + } // execute + // --- Inst_VOPC__V_CMP_F_F32 class methods --- + + Inst_VOPC__V_CMP_F_F32::Inst_VOPC__V_CMP_F_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_F_F32 + + Inst_VOPC__V_CMP_F_F32::~Inst_VOPC__V_CMP_F_F32() + { + } // ~Inst_VOPC__V_CMP_F_F32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_F32 class methods --- + + Inst_VOPC__V_CMP_LT_F32::Inst_VOPC__V_CMP_LT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_LT_F32 + + Inst_VOPC__V_CMP_LT_F32::~Inst_VOPC__V_CMP_LT_F32() + { + } // ~Inst_VOPC__V_CMP_LT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_LT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_F32 class methods --- + + Inst_VOPC__V_CMP_EQ_F32::Inst_VOPC__V_CMP_EQ_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_EQ_F32 + + Inst_VOPC__V_CMP_EQ_F32::~Inst_VOPC__V_CMP_EQ_F32() + { + } // ~Inst_VOPC__V_CMP_EQ_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_F32 class methods --- + + Inst_VOPC__V_CMP_LE_F32::Inst_VOPC__V_CMP_LE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_LE_F32 + + Inst_VOPC__V_CMP_LE_F32::~Inst_VOPC__V_CMP_LE_F32() + { + } // ~Inst_VOPC__V_CMP_LE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_LE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_F32 class methods --- + + Inst_VOPC__V_CMP_GT_F32::Inst_VOPC__V_CMP_GT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_GT_F32 + + Inst_VOPC__V_CMP_GT_F32::~Inst_VOPC__V_CMP_GT_F32() + { + } // ~Inst_VOPC__V_CMP_GT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LG_F32 class methods --- + + Inst_VOPC__V_CMP_LG_F32::Inst_VOPC__V_CMP_LG_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lg_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_LG_F32 + + Inst_VOPC__V_CMP_LG_F32::~Inst_VOPC__V_CMP_LG_F32() + { + } // ~Inst_VOPC__V_CMP_LG_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_LG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_F32 class methods --- + + Inst_VOPC__V_CMP_GE_F32::Inst_VOPC__V_CMP_GE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_GE_F32 + + Inst_VOPC__V_CMP_GE_F32::~Inst_VOPC__V_CMP_GE_F32() + { + } // ~Inst_VOPC__V_CMP_GE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_O_F32 class methods --- + + Inst_VOPC__V_CMP_O_F32::Inst_VOPC__V_CMP_O_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_o_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_O_F32 + + Inst_VOPC__V_CMP_O_F32::~Inst_VOPC__V_CMP_O_F32() + { + } // ~Inst_VOPC__V_CMP_O_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_O_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_U_F32 class methods --- + + Inst_VOPC__V_CMP_U_F32::Inst_VOPC__V_CMP_U_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_u_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_U_F32 + + Inst_VOPC__V_CMP_U_F32::~Inst_VOPC__V_CMP_U_F32() + { + } // ~Inst_VOPC__V_CMP_U_F32 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_U_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NGE_F32 class methods --- + + Inst_VOPC__V_CMP_NGE_F32::Inst_VOPC__V_CMP_NGE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nge_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NGE_F32 + + Inst_VOPC__V_CMP_NGE_F32::~Inst_VOPC__V_CMP_NGE_F32() + { + } // ~Inst_VOPC__V_CMP_NGE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_NGE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLG_F32 class methods --- + + Inst_VOPC__V_CMP_NLG_F32::Inst_VOPC__V_CMP_NLG_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlg_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NLG_F32 + + Inst_VOPC__V_CMP_NLG_F32::~Inst_VOPC__V_CMP_NLG_F32() + { + } // ~Inst_VOPC__V_CMP_NLG_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NGT_F32 class methods --- + + Inst_VOPC__V_CMP_NGT_F32::Inst_VOPC__V_CMP_NGT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ngt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NGT_F32 + + Inst_VOPC__V_CMP_NGT_F32::~Inst_VOPC__V_CMP_NGT_F32() + { + } // ~Inst_VOPC__V_CMP_NGT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_NGT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLE_F32 class methods --- + + Inst_VOPC__V_CMP_NLE_F32::Inst_VOPC__V_CMP_NLE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nle_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NLE_F32 + + Inst_VOPC__V_CMP_NLE_F32::~Inst_VOPC__V_CMP_NLE_F32() + { + } // ~Inst_VOPC__V_CMP_NLE_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NEQ_F32 class methods --- + + Inst_VOPC__V_CMP_NEQ_F32::Inst_VOPC__V_CMP_NEQ_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_neq_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NEQ_F32 + + Inst_VOPC__V_CMP_NEQ_F32::~Inst_VOPC__V_CMP_NEQ_F32() + { + } // ~Inst_VOPC__V_CMP_NEQ_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLT_F32 class methods --- + + Inst_VOPC__V_CMP_NLT_F32::Inst_VOPC__V_CMP_NLT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlt_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_NLT_F32 + + Inst_VOPC__V_CMP_NLT_F32::~Inst_VOPC__V_CMP_NLT_F32() + { + } // ~Inst_VOPC__V_CMP_NLT_F32 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_TRU_F32 class methods --- + + Inst_VOPC__V_CMP_TRU_F32::Inst_VOPC__V_CMP_TRU_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_tru_f32") + { + setFlag(ALU); + setFlag(F32); + } // Inst_VOPC__V_CMP_TRU_F32 + + Inst_VOPC__V_CMP_TRU_F32::~Inst_VOPC__V_CMP_TRU_F32() + { + } // ~Inst_VOPC__V_CMP_TRU_F32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_TRU_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_F32 class methods --- + + Inst_VOPC__V_CMPX_F_F32::Inst_VOPC__V_CMPX_F_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_F32 + + Inst_VOPC__V_CMPX_F_F32::~Inst_VOPC__V_CMPX_F_F32() + { + } // ~Inst_VOPC__V_CMPX_F_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_F32 class methods --- + + Inst_VOPC__V_CMPX_LT_F32::Inst_VOPC__V_CMPX_LT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_F32 + + Inst_VOPC__V_CMPX_LT_F32::~Inst_VOPC__V_CMPX_LT_F32() + { + } // ~Inst_VOPC__V_CMPX_LT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_F32 class methods --- + + Inst_VOPC__V_CMPX_EQ_F32::Inst_VOPC__V_CMPX_EQ_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_F32 + + Inst_VOPC__V_CMPX_EQ_F32::~Inst_VOPC__V_CMPX_EQ_F32() + { + } // ~Inst_VOPC__V_CMPX_EQ_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_F32 class methods --- + + Inst_VOPC__V_CMPX_LE_F32::Inst_VOPC__V_CMPX_LE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_F32 + + Inst_VOPC__V_CMPX_LE_F32::~Inst_VOPC__V_CMPX_LE_F32() + { + } // ~Inst_VOPC__V_CMPX_LE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_F32 class methods --- + + Inst_VOPC__V_CMPX_GT_F32::Inst_VOPC__V_CMPX_GT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_F32 + + Inst_VOPC__V_CMPX_GT_F32::~Inst_VOPC__V_CMPX_GT_F32() + { + } // ~Inst_VOPC__V_CMPX_GT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LG_F32 class methods --- + + Inst_VOPC__V_CMPX_LG_F32::Inst_VOPC__V_CMPX_LG_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lg_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LG_F32 + + Inst_VOPC__V_CMPX_LG_F32::~Inst_VOPC__V_CMPX_LG_F32() + { + } // ~Inst_VOPC__V_CMPX_LG_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_F32 class methods --- + + Inst_VOPC__V_CMPX_GE_F32::Inst_VOPC__V_CMPX_GE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_F32 + + Inst_VOPC__V_CMPX_GE_F32::~Inst_VOPC__V_CMPX_GE_F32() + { + } // ~Inst_VOPC__V_CMPX_GE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_O_F32 class methods --- + + Inst_VOPC__V_CMPX_O_F32::Inst_VOPC__V_CMPX_O_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_o_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_O_F32 + + Inst_VOPC__V_CMPX_O_F32::~Inst_VOPC__V_CMPX_O_F32() + { + } // ~Inst_VOPC__V_CMPX_O_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_O_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_U_F32 class methods --- + + Inst_VOPC__V_CMPX_U_F32::Inst_VOPC__V_CMPX_U_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_u_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_U_F32 + + Inst_VOPC__V_CMPX_U_F32::~Inst_VOPC__V_CMPX_U_F32() + { + } // ~Inst_VOPC__V_CMPX_U_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. 
+ void + Inst_VOPC__V_CMPX_U_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NGE_F32 class methods --- + + Inst_VOPC__V_CMPX_NGE_F32::Inst_VOPC__V_CMPX_NGE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nge_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGE_F32 + + Inst_VOPC__V_CMPX_NGE_F32::~Inst_VOPC__V_CMPX_NGE_F32() + { + } // ~Inst_VOPC__V_CMPX_NGE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NGE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NLG_F32 class methods --- + + Inst_VOPC__V_CMPX_NLG_F32::Inst_VOPC__V_CMPX_NLG_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlg_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLG_F32 + + Inst_VOPC__V_CMPX_NLG_F32::~Inst_VOPC__V_CMPX_NLG_F32() + { + } // ~Inst_VOPC__V_CMPX_NLG_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLG_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NGT_F32 class methods --- + + Inst_VOPC__V_CMPX_NGT_F32::Inst_VOPC__V_CMPX_NGT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ngt_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGT_F32 + + Inst_VOPC__V_CMPX_NGT_F32::~Inst_VOPC__V_CMPX_NGT_F32() + { + } // ~Inst_VOPC__V_CMPX_NGT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NGT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NLE_F32 class methods --- + + Inst_VOPC__V_CMPX_NLE_F32::Inst_VOPC__V_CMPX_NLE_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nle_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLE_F32 + + Inst_VOPC__V_CMPX_NLE_F32::~Inst_VOPC__V_CMPX_NLE_F32() + { + } // ~Inst_VOPC__V_CMPX_NLE_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLE_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NEQ_F32 class methods --- + + Inst_VOPC__V_CMPX_NEQ_F32::Inst_VOPC__V_CMPX_NEQ_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_neq_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NEQ_F32 + + Inst_VOPC__V_CMPX_NEQ_F32::~Inst_VOPC__V_CMPX_NEQ_F32() + { + } // ~Inst_VOPC__V_CMPX_NEQ_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NEQ_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] == src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_NLT_F32 class methods --- + + Inst_VOPC__V_CMPX_NLT_F32::Inst_VOPC__V_CMPX_NLT_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlt_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLT_F32 + + Inst_VOPC__V_CMPX_NLT_F32::~Inst_VOPC__V_CMPX_NLT_F32() + { + } // ~Inst_VOPC__V_CMPX_NLT_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLT_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_TRU_F32 class methods --- + + Inst_VOPC__V_CMPX_TRU_F32::Inst_VOPC__V_CMPX_TRU_F32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_tru_f32") + { + setFlag(ALU); + setFlag(F32); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_TRU_F32 + + Inst_VOPC__V_CMPX_TRU_F32::~Inst_VOPC__V_CMPX_TRU_F32() + { + } // ~Inst_VOPC__V_CMPX_TRU_F32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_TRU_F32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMP_F_F64 class methods --- + + Inst_VOPC__V_CMP_F_F64::Inst_VOPC__V_CMP_F_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_F_F64 + + Inst_VOPC__V_CMP_F_F64::~Inst_VOPC__V_CMP_F_F64() + { + } // ~Inst_VOPC__V_CMP_F_F64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_F64 class methods --- + + Inst_VOPC__V_CMP_LT_F64::Inst_VOPC__V_CMP_LT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_LT_F64 + + Inst_VOPC__V_CMP_LT_F64::~Inst_VOPC__V_CMP_LT_F64() + { + } // ~Inst_VOPC__V_CMP_LT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_F64 class methods --- + + Inst_VOPC__V_CMP_EQ_F64::Inst_VOPC__V_CMP_EQ_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_EQ_F64 + + Inst_VOPC__V_CMP_EQ_F64::~Inst_VOPC__V_CMP_EQ_F64() + { + } // ~Inst_VOPC__V_CMP_EQ_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_F64 class methods --- + + Inst_VOPC__V_CMP_LE_F64::Inst_VOPC__V_CMP_LE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_LE_F64 + + Inst_VOPC__V_CMP_LE_F64::~Inst_VOPC__V_CMP_LE_F64() + { + } // ~Inst_VOPC__V_CMP_LE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_F64 class methods --- + + Inst_VOPC__V_CMP_GT_F64::Inst_VOPC__V_CMP_GT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_GT_F64 + + Inst_VOPC__V_CMP_GT_F64::~Inst_VOPC__V_CMP_GT_F64() + { + } // ~Inst_VOPC__V_CMP_GT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LG_F64 class methods --- + + Inst_VOPC__V_CMP_LG_F64::Inst_VOPC__V_CMP_LG_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lg_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_LG_F64 + + Inst_VOPC__V_CMP_LG_F64::~Inst_VOPC__V_CMP_LG_F64() + { + } // ~Inst_VOPC__V_CMP_LG_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_F64 class methods --- + + Inst_VOPC__V_CMP_GE_F64::Inst_VOPC__V_CMP_GE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_GE_F64 + + Inst_VOPC__V_CMP_GE_F64::~Inst_VOPC__V_CMP_GE_F64() + { + } // ~Inst_VOPC__V_CMP_GE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_O_F64 class methods --- + + Inst_VOPC__V_CMP_O_F64::Inst_VOPC__V_CMP_O_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_o_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_O_F64 + + Inst_VOPC__V_CMP_O_F64::~Inst_VOPC__V_CMP_O_F64() + { + } // ~Inst_VOPC__V_CMP_O_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_O_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_U_F64 class methods --- + + Inst_VOPC__V_CMP_U_F64::Inst_VOPC__V_CMP_U_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_u_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_U_F64 + + Inst_VOPC__V_CMP_U_F64::~Inst_VOPC__V_CMP_U_F64() + { + } // ~Inst_VOPC__V_CMP_U_F64 + + // --- description from .arch file --- + // D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_U_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NGE_F64 class methods --- + + Inst_VOPC__V_CMP_NGE_F64::Inst_VOPC__V_CMP_NGE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nge_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NGE_F64 + + Inst_VOPC__V_CMP_NGE_F64::~Inst_VOPC__V_CMP_NGE_F64() + { + } // ~Inst_VOPC__V_CMP_NGE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NGE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLG_F64 class methods --- + + Inst_VOPC__V_CMP_NLG_F64::Inst_VOPC__V_CMP_NLG_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlg_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NLG_F64 + + Inst_VOPC__V_CMP_NLG_F64::~Inst_VOPC__V_CMP_NLG_F64() + { + } // ~Inst_VOPC__V_CMP_NLG_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NGT_F64 class methods --- + + Inst_VOPC__V_CMP_NGT_F64::Inst_VOPC__V_CMP_NGT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ngt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NGT_F64 + + Inst_VOPC__V_CMP_NGT_F64::~Inst_VOPC__V_CMP_NGT_F64() + { + } // ~Inst_VOPC__V_CMP_NGT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NGT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLE_F64 class methods --- + + Inst_VOPC__V_CMP_NLE_F64::Inst_VOPC__V_CMP_NLE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nle_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NLE_F64 + + Inst_VOPC__V_CMP_NLE_F64::~Inst_VOPC__V_CMP_NLE_F64() + { + } // ~Inst_VOPC__V_CMP_NLE_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NEQ_F64 class methods --- + + Inst_VOPC__V_CMP_NEQ_F64::Inst_VOPC__V_CMP_NEQ_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_neq_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NEQ_F64 + + Inst_VOPC__V_CMP_NEQ_F64::~Inst_VOPC__V_CMP_NEQ_F64() + { + } // ~Inst_VOPC__V_CMP_NEQ_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NLT_F64 class methods --- + + Inst_VOPC__V_CMP_NLT_F64::Inst_VOPC__V_CMP_NLT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_nlt_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_NLT_F64 + + Inst_VOPC__V_CMP_NLT_F64::~Inst_VOPC__V_CMP_NLT_F64() + { + } // ~Inst_VOPC__V_CMP_NLT_F64 + + // --- description from .arch file --- + // D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NLT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_TRU_F64 class methods --- + + Inst_VOPC__V_CMP_TRU_F64::Inst_VOPC__V_CMP_TRU_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_tru_f64") + { + setFlag(ALU); + setFlag(F64); + } // Inst_VOPC__V_CMP_TRU_F64 + + Inst_VOPC__V_CMP_TRU_F64::~Inst_VOPC__V_CMP_TRU_F64() + { + } // ~Inst_VOPC__V_CMP_TRU_F64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_TRU_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_F64 class methods --- + + Inst_VOPC__V_CMPX_F_F64::Inst_VOPC__V_CMPX_F_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_F64 + + Inst_VOPC__V_CMPX_F_F64::~Inst_VOPC__V_CMPX_F_F64() + { + } // ~Inst_VOPC__V_CMPX_F_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_F64 class methods --- + + Inst_VOPC__V_CMPX_LT_F64::Inst_VOPC__V_CMPX_LT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_F64 + + Inst_VOPC__V_CMPX_LT_F64::~Inst_VOPC__V_CMPX_LT_F64() + { + } // ~Inst_VOPC__V_CMPX_LT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_F64 class methods --- + + Inst_VOPC__V_CMPX_EQ_F64::Inst_VOPC__V_CMPX_EQ_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_F64 + + Inst_VOPC__V_CMPX_EQ_F64::~Inst_VOPC__V_CMPX_EQ_F64() + { + } // ~Inst_VOPC__V_CMPX_EQ_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + wf->execMask() = vcc.rawData(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_F64 class methods --- + + Inst_VOPC__V_CMPX_LE_F64::Inst_VOPC__V_CMPX_LE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_F64 + + Inst_VOPC__V_CMPX_LE_F64::~Inst_VOPC__V_CMPX_LE_F64() + { + } // ~Inst_VOPC__V_CMPX_LE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_F64 class methods --- + + Inst_VOPC__V_CMPX_GT_F64::Inst_VOPC__V_CMPX_GT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_F64 + + Inst_VOPC__V_CMPX_GT_F64::~Inst_VOPC__V_CMPX_GT_F64() + { + } // ~Inst_VOPC__V_CMPX_GT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LG_F64 class methods --- + + Inst_VOPC__V_CMPX_LG_F64::Inst_VOPC__V_CMPX_LG_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lg_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LG_F64 + + Inst_VOPC__V_CMPX_LG_F64::~Inst_VOPC__V_CMPX_LG_F64() + { + } // ~Inst_VOPC__V_CMPX_LG_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_F64 class methods --- + + Inst_VOPC__V_CMPX_GE_F64::Inst_VOPC__V_CMPX_GE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_F64 + + Inst_VOPC__V_CMPX_GE_F64::~Inst_VOPC__V_CMPX_GE_F64() + { + } // ~Inst_VOPC__V_CMPX_GE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_O_F64 class methods --- + + Inst_VOPC__V_CMPX_O_F64::Inst_VOPC__V_CMPX_O_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_o_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_O_F64 + + Inst_VOPC__V_CMPX_O_F64::~Inst_VOPC__V_CMPX_O_F64() + { + } // ~Inst_VOPC__V_CMPX_O_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (!isNan(S0) && !isNan(S1)); D = VCC in VOPC + // encoding. + void + Inst_VOPC__V_CMPX_O_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (!std::isnan(src0[lane]) + && !std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_U_F64 class methods --- + + Inst_VOPC__V_CMPX_U_F64::Inst_VOPC__V_CMPX_U_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_u_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_U_F64 + + Inst_VOPC__V_CMPX_U_F64::~Inst_VOPC__V_CMPX_U_F64() + { + } // ~Inst_VOPC__V_CMPX_U_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (isNan(S0) || isNan(S1)); D = VCC in VOPC + // encoding. 
+ void + Inst_VOPC__V_CMPX_U_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, (std::isnan(src0[lane]) + || std::isnan(src1[lane])) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NGE_F64 class methods --- + + Inst_VOPC__V_CMPX_NGE_F64::Inst_VOPC__V_CMPX_NGE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nge_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGE_F64 + + Inst_VOPC__V_CMPX_NGE_F64::~Inst_VOPC__V_CMPX_NGE_F64() + { + } // ~Inst_VOPC__V_CMPX_NGE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NGE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] >= src1[lane]) ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NLG_F64 class methods --- + + Inst_VOPC__V_CMPX_NLG_F64::Inst_VOPC__V_CMPX_NLG_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlg_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLG_F64 + + Inst_VOPC__V_CMPX_NLG_F64::~Inst_VOPC__V_CMPX_NLG_F64() + { + } // ~Inst_VOPC__V_CMPX_NLG_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLG_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane] + || src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NGT_F64 class methods --- + + Inst_VOPC__V_CMPX_NGT_F64::Inst_VOPC__V_CMPX_NGT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ngt_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NGT_F64 + + Inst_VOPC__V_CMPX_NGT_F64::~Inst_VOPC__V_CMPX_NGT_F64() + { + } // ~Inst_VOPC__V_CMPX_NGT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 > S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NGT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] > src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NLE_F64 class methods --- + + Inst_VOPC__V_CMPX_NLE_F64::Inst_VOPC__V_CMPX_NLE_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nle_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLE_F64 + + Inst_VOPC__V_CMPX_NLE_F64::~Inst_VOPC__V_CMPX_NLE_F64() + { + } // ~Inst_VOPC__V_CMPX_NLE_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLE_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] <= src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NEQ_F64 class methods --- + + Inst_VOPC__V_CMPX_NEQ_F64::Inst_VOPC__V_CMPX_NEQ_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_neq_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NEQ_F64 + + Inst_VOPC__V_CMPX_NEQ_F64::~Inst_VOPC__V_CMPX_NEQ_F64() + { + } // ~Inst_VOPC__V_CMPX_NEQ_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 == S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NEQ_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NLT_F64 class methods --- + + Inst_VOPC__V_CMPX_NLT_F64::Inst_VOPC__V_CMPX_NLT_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_nlt_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NLT_F64 + + Inst_VOPC__V_CMPX_NLT_F64::~Inst_VOPC__V_CMPX_NLT_F64() + { + } // ~Inst_VOPC__V_CMPX_NLT_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = !(S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NLT_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandF64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, !(src0[lane] < src1[lane]) ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_TRU_F64 class methods --- + + Inst_VOPC__V_CMPX_TRU_F64::Inst_VOPC__V_CMPX_TRU_F64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_tru_f64") + { + setFlag(ALU); + setFlag(F64); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_TRU_F64 + + Inst_VOPC__V_CMPX_TRU_F64::~Inst_VOPC__V_CMPX_TRU_F64() + { + } // ~Inst_VOPC__V_CMPX_TRU_F64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_TRU_F64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_I16 class methods --- + + Inst_VOPC__V_CMP_F_I16::Inst_VOPC__V_CMP_F_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_I16 + + Inst_VOPC__V_CMP_F_I16::~Inst_VOPC__V_CMP_F_I16() + { + } // ~Inst_VOPC__V_CMP_F_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_I16 class methods --- + + Inst_VOPC__V_CMP_LT_I16::Inst_VOPC__V_CMP_LT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_I16 + + Inst_VOPC__V_CMP_LT_I16::~Inst_VOPC__V_CMP_LT_I16() + { + } // ~Inst_VOPC__V_CMP_LT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_I16 class methods --- + + Inst_VOPC__V_CMP_EQ_I16::Inst_VOPC__V_CMP_EQ_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_I16 + + Inst_VOPC__V_CMP_EQ_I16::~Inst_VOPC__V_CMP_EQ_I16() + { + } // ~Inst_VOPC__V_CMP_EQ_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_I16 class methods --- + + Inst_VOPC__V_CMP_LE_I16::Inst_VOPC__V_CMP_LE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_I16 + + Inst_VOPC__V_CMP_LE_I16::~Inst_VOPC__V_CMP_LE_I16() + { + } // ~Inst_VOPC__V_CMP_LE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_I16 class methods --- + + Inst_VOPC__V_CMP_GT_I16::Inst_VOPC__V_CMP_GT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_I16 + + Inst_VOPC__V_CMP_GT_I16::~Inst_VOPC__V_CMP_GT_I16() + { + } // ~Inst_VOPC__V_CMP_GT_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_I16 class methods --- + + Inst_VOPC__V_CMP_NE_I16::Inst_VOPC__V_CMP_NE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_I16 + + Inst_VOPC__V_CMP_NE_I16::~Inst_VOPC__V_CMP_NE_I16() + { + } // ~Inst_VOPC__V_CMP_NE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_I16 class methods --- + + Inst_VOPC__V_CMP_GE_I16::Inst_VOPC__V_CMP_GE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_I16 + + Inst_VOPC__V_CMP_GE_I16::~Inst_VOPC__V_CMP_GE_I16() + { + } // ~Inst_VOPC__V_CMP_GE_I16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_I16 class methods --- + + Inst_VOPC__V_CMP_T_I16::Inst_VOPC__V_CMP_T_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_i16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_I16 + + Inst_VOPC__V_CMP_T_I16::~Inst_VOPC__V_CMP_T_I16() + { + } // ~Inst_VOPC__V_CMP_T_I16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_U16 class methods --- + + Inst_VOPC__V_CMP_F_U16::Inst_VOPC__V_CMP_F_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_U16 + + Inst_VOPC__V_CMP_F_U16::~Inst_VOPC__V_CMP_F_U16() + { + } // ~Inst_VOPC__V_CMP_F_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_U16 class methods --- + + Inst_VOPC__V_CMP_LT_U16::Inst_VOPC__V_CMP_LT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_U16 + + Inst_VOPC__V_CMP_LT_U16::~Inst_VOPC__V_CMP_LT_U16() + { + } // ~Inst_VOPC__V_CMP_LT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_U16 class methods --- + + Inst_VOPC__V_CMP_EQ_U16::Inst_VOPC__V_CMP_EQ_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_U16 + + Inst_VOPC__V_CMP_EQ_U16::~Inst_VOPC__V_CMP_EQ_U16() + { + } // ~Inst_VOPC__V_CMP_EQ_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_U16 class methods --- + + Inst_VOPC__V_CMP_LE_U16::Inst_VOPC__V_CMP_LE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_U16 + + Inst_VOPC__V_CMP_LE_U16::~Inst_VOPC__V_CMP_LE_U16() + { + } // ~Inst_VOPC__V_CMP_LE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_U16 class methods --- + + Inst_VOPC__V_CMP_GT_U16::Inst_VOPC__V_CMP_GT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_U16 + + Inst_VOPC__V_CMP_GT_U16::~Inst_VOPC__V_CMP_GT_U16() + { + } // ~Inst_VOPC__V_CMP_GT_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_U16 class methods --- + + Inst_VOPC__V_CMP_NE_U16::Inst_VOPC__V_CMP_NE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_U16 + + Inst_VOPC__V_CMP_NE_U16::~Inst_VOPC__V_CMP_NE_U16() + { + } // ~Inst_VOPC__V_CMP_NE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_U16 class methods --- + + Inst_VOPC__V_CMP_GE_U16::Inst_VOPC__V_CMP_GE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_U16 + + Inst_VOPC__V_CMP_GE_U16::~Inst_VOPC__V_CMP_GE_U16() + { + } // ~Inst_VOPC__V_CMP_GE_U16 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_U16 class methods --- + + Inst_VOPC__V_CMP_T_U16::Inst_VOPC__V_CMP_T_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_u16") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_U16 + + Inst_VOPC__V_CMP_T_U16::~Inst_VOPC__V_CMP_T_U16() + { + } // ~Inst_VOPC__V_CMP_T_U16 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_I16 class methods --- + + Inst_VOPC__V_CMPX_F_I16::Inst_VOPC__V_CMPX_F_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_I16 + + Inst_VOPC__V_CMPX_F_I16::~Inst_VOPC__V_CMPX_F_I16() + { + } // ~Inst_VOPC__V_CMPX_F_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_I16 class methods --- + + Inst_VOPC__V_CMPX_LT_I16::Inst_VOPC__V_CMPX_LT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_I16 + + Inst_VOPC__V_CMPX_LT_I16::~Inst_VOPC__V_CMPX_LT_I16() + { + } // ~Inst_VOPC__V_CMPX_LT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_I16 class methods --- + + Inst_VOPC__V_CMPX_EQ_I16::Inst_VOPC__V_CMPX_EQ_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_I16 + + Inst_VOPC__V_CMPX_EQ_I16::~Inst_VOPC__V_CMPX_EQ_I16() + { + } // ~Inst_VOPC__V_CMPX_EQ_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_I16 class methods --- + + Inst_VOPC__V_CMPX_LE_I16::Inst_VOPC__V_CMPX_LE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_I16 + + Inst_VOPC__V_CMPX_LE_I16::~Inst_VOPC__V_CMPX_LE_I16() + { + } // ~Inst_VOPC__V_CMPX_LE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_I16 class methods --- + + Inst_VOPC__V_CMPX_GT_I16::Inst_VOPC__V_CMPX_GT_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_I16 + + Inst_VOPC__V_CMPX_GT_I16::~Inst_VOPC__V_CMPX_GT_I16() + { + } // ~Inst_VOPC__V_CMPX_GT_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_I16 class methods --- + + Inst_VOPC__V_CMPX_NE_I16::Inst_VOPC__V_CMPX_NE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_I16 + + Inst_VOPC__V_CMPX_NE_I16::~Inst_VOPC__V_CMPX_NE_I16() + { + } // ~Inst_VOPC__V_CMPX_NE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_I16 class methods --- + + Inst_VOPC__V_CMPX_GE_I16::Inst_VOPC__V_CMPX_GE_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_I16 + + Inst_VOPC__V_CMPX_GE_I16::~Inst_VOPC__V_CMPX_GE_I16() + { + } // ~Inst_VOPC__V_CMPX_GE_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_I16 class methods --- + + Inst_VOPC__V_CMPX_T_I16::Inst_VOPC__V_CMPX_T_I16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_i16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_I16 + + Inst_VOPC__V_CMPX_T_I16::~Inst_VOPC__V_CMPX_T_I16() + { + } // ~Inst_VOPC__V_CMPX_T_I16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_I16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_U16 class methods --- + + Inst_VOPC__V_CMPX_F_U16::Inst_VOPC__V_CMPX_F_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_U16 + + Inst_VOPC__V_CMPX_F_U16::~Inst_VOPC__V_CMPX_F_U16() + { + } // ~Inst_VOPC__V_CMPX_F_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_U16 class methods --- + + Inst_VOPC__V_CMPX_LT_U16::Inst_VOPC__V_CMPX_LT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_U16 + + Inst_VOPC__V_CMPX_LT_U16::~Inst_VOPC__V_CMPX_LT_U16() + { + } // ~Inst_VOPC__V_CMPX_LT_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_U16 class methods --- + + Inst_VOPC__V_CMPX_EQ_U16::Inst_VOPC__V_CMPX_EQ_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_U16 + + Inst_VOPC__V_CMPX_EQ_U16::~Inst_VOPC__V_CMPX_EQ_U16() + { + } // ~Inst_VOPC__V_CMPX_EQ_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_U16 class methods --- + + Inst_VOPC__V_CMPX_LE_U16::Inst_VOPC__V_CMPX_LE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_U16 + + Inst_VOPC__V_CMPX_LE_U16::~Inst_VOPC__V_CMPX_LE_U16() + { + } // ~Inst_VOPC__V_CMPX_LE_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_U16 class methods --- + + Inst_VOPC__V_CMPX_GT_U16::Inst_VOPC__V_CMPX_GT_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_U16 + + Inst_VOPC__V_CMPX_GT_U16::~Inst_VOPC__V_CMPX_GT_U16() + { + } // ~Inst_VOPC__V_CMPX_GT_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_U16 class methods --- + + Inst_VOPC__V_CMPX_NE_U16::Inst_VOPC__V_CMPX_NE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_U16 + + Inst_VOPC__V_CMPX_NE_U16::~Inst_VOPC__V_CMPX_NE_U16() + { + } // ~Inst_VOPC__V_CMPX_NE_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_U16 class methods --- + + Inst_VOPC__V_CMPX_GE_U16::Inst_VOPC__V_CMPX_GE_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_U16 + + Inst_VOPC__V_CMPX_GE_U16::~Inst_VOPC__V_CMPX_GE_U16() + { + } // ~Inst_VOPC__V_CMPX_GE_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU16 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_U16 class methods --- + + Inst_VOPC__V_CMPX_T_U16::Inst_VOPC__V_CMPX_T_U16(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_u16") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_U16 + + Inst_VOPC__V_CMPX_T_U16::~Inst_VOPC__V_CMPX_T_U16() + { + } // ~Inst_VOPC__V_CMPX_T_U16 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_U16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_I32 class methods --- + + Inst_VOPC__V_CMP_F_I32::Inst_VOPC__V_CMP_F_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_I32 + + Inst_VOPC__V_CMP_F_I32::~Inst_VOPC__V_CMP_F_I32() + { + } // ~Inst_VOPC__V_CMP_F_I32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_I32 class methods --- + + Inst_VOPC__V_CMP_LT_I32::Inst_VOPC__V_CMP_LT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_I32 + + Inst_VOPC__V_CMP_LT_I32::~Inst_VOPC__V_CMP_LT_I32() + { + } // ~Inst_VOPC__V_CMP_LT_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_I32 class methods --- + + Inst_VOPC__V_CMP_EQ_I32::Inst_VOPC__V_CMP_EQ_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_I32 + + Inst_VOPC__V_CMP_EQ_I32::~Inst_VOPC__V_CMP_EQ_I32() + { + } // ~Inst_VOPC__V_CMP_EQ_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_I32 class methods --- + + Inst_VOPC__V_CMP_LE_I32::Inst_VOPC__V_CMP_LE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_I32 + + Inst_VOPC__V_CMP_LE_I32::~Inst_VOPC__V_CMP_LE_I32() + { + } // ~Inst_VOPC__V_CMP_LE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_I32 class methods --- + + Inst_VOPC__V_CMP_GT_I32::Inst_VOPC__V_CMP_GT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_I32 + + Inst_VOPC__V_CMP_GT_I32::~Inst_VOPC__V_CMP_GT_I32() + { + } // ~Inst_VOPC__V_CMP_GT_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_I32 class methods --- + + Inst_VOPC__V_CMP_NE_I32::Inst_VOPC__V_CMP_NE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_I32 + + Inst_VOPC__V_CMP_NE_I32::~Inst_VOPC__V_CMP_NE_I32() + { + } // ~Inst_VOPC__V_CMP_NE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_I32 class methods --- + + Inst_VOPC__V_CMP_GE_I32::Inst_VOPC__V_CMP_GE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_I32 + + Inst_VOPC__V_CMP_GE_I32::~Inst_VOPC__V_CMP_GE_I32() + { + } // ~Inst_VOPC__V_CMP_GE_I32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_I32 class methods --- + + Inst_VOPC__V_CMP_T_I32::Inst_VOPC__V_CMP_T_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_i32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_I32 + + Inst_VOPC__V_CMP_T_I32::~Inst_VOPC__V_CMP_T_I32() + { + } // ~Inst_VOPC__V_CMP_T_I32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_U32 class methods --- + + Inst_VOPC__V_CMP_F_U32::Inst_VOPC__V_CMP_F_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_U32 + + Inst_VOPC__V_CMP_F_U32::~Inst_VOPC__V_CMP_F_U32() + { + } // ~Inst_VOPC__V_CMP_F_U32 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_U32 class methods --- + + Inst_VOPC__V_CMP_LT_U32::Inst_VOPC__V_CMP_LT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_U32 + + Inst_VOPC__V_CMP_LT_U32::~Inst_VOPC__V_CMP_LT_U32() + { + } // ~Inst_VOPC__V_CMP_LT_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_U32 class methods --- + + Inst_VOPC__V_CMP_EQ_U32::Inst_VOPC__V_CMP_EQ_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_U32 + + Inst_VOPC__V_CMP_EQ_U32::~Inst_VOPC__V_CMP_EQ_U32() + { + } // ~Inst_VOPC__V_CMP_EQ_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_U32 class methods --- + + Inst_VOPC__V_CMP_LE_U32::Inst_VOPC__V_CMP_LE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_U32 + + Inst_VOPC__V_CMP_LE_U32::~Inst_VOPC__V_CMP_LE_U32() + { + } // ~Inst_VOPC__V_CMP_LE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_U32 class methods --- + + Inst_VOPC__V_CMP_GT_U32::Inst_VOPC__V_CMP_GT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_U32 + + Inst_VOPC__V_CMP_GT_U32::~Inst_VOPC__V_CMP_GT_U32() + { + } // ~Inst_VOPC__V_CMP_GT_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_U32 class methods --- + + Inst_VOPC__V_CMP_NE_U32::Inst_VOPC__V_CMP_NE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_U32 + + Inst_VOPC__V_CMP_NE_U32::~Inst_VOPC__V_CMP_NE_U32() + { + } // ~Inst_VOPC__V_CMP_NE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_U32 class methods --- + + Inst_VOPC__V_CMP_GE_U32::Inst_VOPC__V_CMP_GE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_U32 + + Inst_VOPC__V_CMP_GE_U32::~Inst_VOPC__V_CMP_GE_U32() + { + } // ~Inst_VOPC__V_CMP_GE_U32 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_U32 class methods --- + + Inst_VOPC__V_CMP_T_U32::Inst_VOPC__V_CMP_T_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_u32") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_U32 + + Inst_VOPC__V_CMP_T_U32::~Inst_VOPC__V_CMP_T_U32() + { + } // ~Inst_VOPC__V_CMP_T_U32 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_I32 class methods --- + + Inst_VOPC__V_CMPX_F_I32::Inst_VOPC__V_CMPX_F_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_I32 + + Inst_VOPC__V_CMPX_F_I32::~Inst_VOPC__V_CMPX_F_I32() + { + } // ~Inst_VOPC__V_CMPX_F_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_I32 class methods --- + + Inst_VOPC__V_CMPX_LT_I32::Inst_VOPC__V_CMPX_LT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_I32 + + Inst_VOPC__V_CMPX_LT_I32::~Inst_VOPC__V_CMPX_LT_I32() + { + } // ~Inst_VOPC__V_CMPX_LT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_I32 class methods --- + + Inst_VOPC__V_CMPX_EQ_I32::Inst_VOPC__V_CMPX_EQ_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_I32 + + Inst_VOPC__V_CMPX_EQ_I32::~Inst_VOPC__V_CMPX_EQ_I32() + { + } // ~Inst_VOPC__V_CMPX_EQ_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_I32 class methods --- + + Inst_VOPC__V_CMPX_LE_I32::Inst_VOPC__V_CMPX_LE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_I32 + + Inst_VOPC__V_CMPX_LE_I32::~Inst_VOPC__V_CMPX_LE_I32() + { + } // ~Inst_VOPC__V_CMPX_LE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_I32 class methods --- + + Inst_VOPC__V_CMPX_GT_I32::Inst_VOPC__V_CMPX_GT_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_I32 + + Inst_VOPC__V_CMPX_GT_I32::~Inst_VOPC__V_CMPX_GT_I32() + { + } // ~Inst_VOPC__V_CMPX_GT_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_I32 class methods --- + + Inst_VOPC__V_CMPX_NE_I32::Inst_VOPC__V_CMPX_NE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_I32 + + Inst_VOPC__V_CMPX_NE_I32::~Inst_VOPC__V_CMPX_NE_I32() + { + } // ~Inst_VOPC__V_CMPX_NE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_I32 class methods --- + + Inst_VOPC__V_CMPX_GE_I32::Inst_VOPC__V_CMPX_GE_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_I32 + + Inst_VOPC__V_CMPX_GE_I32::~Inst_VOPC__V_CMPX_GE_I32() + { + } // ~Inst_VOPC__V_CMPX_GE_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_I32 class methods --- + + Inst_VOPC__V_CMPX_T_I32::Inst_VOPC__V_CMPX_T_I32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_i32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_I32 + + Inst_VOPC__V_CMPX_T_I32::~Inst_VOPC__V_CMPX_T_I32() + { + } // ~Inst_VOPC__V_CMPX_T_I32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_I32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_U32 class methods --- + + Inst_VOPC__V_CMPX_F_U32::Inst_VOPC__V_CMPX_F_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_U32 + + Inst_VOPC__V_CMPX_F_U32::~Inst_VOPC__V_CMPX_F_U32() + { + } // ~Inst_VOPC__V_CMPX_F_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_U32 class methods --- + + Inst_VOPC__V_CMPX_LT_U32::Inst_VOPC__V_CMPX_LT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_U32 + + Inst_VOPC__V_CMPX_LT_U32::~Inst_VOPC__V_CMPX_LT_U32() + { + } // ~Inst_VOPC__V_CMPX_LT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_U32 class methods --- + + Inst_VOPC__V_CMPX_EQ_U32::Inst_VOPC__V_CMPX_EQ_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_U32 + + Inst_VOPC__V_CMPX_EQ_U32::~Inst_VOPC__V_CMPX_EQ_U32() + { + } // ~Inst_VOPC__V_CMPX_EQ_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_U32 class methods --- + + Inst_VOPC__V_CMPX_LE_U32::Inst_VOPC__V_CMPX_LE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_U32 + + Inst_VOPC__V_CMPX_LE_U32::~Inst_VOPC__V_CMPX_LE_U32() + { + } // ~Inst_VOPC__V_CMPX_LE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_U32 class methods --- + + Inst_VOPC__V_CMPX_GT_U32::Inst_VOPC__V_CMPX_GT_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_U32 + + Inst_VOPC__V_CMPX_GT_U32::~Inst_VOPC__V_CMPX_GT_U32() + { + } // ~Inst_VOPC__V_CMPX_GT_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_U32 class methods --- + + Inst_VOPC__V_CMPX_NE_U32::Inst_VOPC__V_CMPX_NE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_U32 + + Inst_VOPC__V_CMPX_NE_U32::~Inst_VOPC__V_CMPX_NE_U32() + { + } // ~Inst_VOPC__V_CMPX_NE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_U32 class methods --- + + Inst_VOPC__V_CMPX_GE_U32::Inst_VOPC__V_CMPX_GE_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_U32 + + Inst_VOPC__V_CMPX_GE_U32::~Inst_VOPC__V_CMPX_GE_U32() + { + } // ~Inst_VOPC__V_CMPX_GE_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU32 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_U32 class methods --- + + Inst_VOPC__V_CMPX_T_U32::Inst_VOPC__V_CMPX_T_U32(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_u32") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_U32 + + Inst_VOPC__V_CMPX_T_U32::~Inst_VOPC__V_CMPX_T_U32() + { + } // ~Inst_VOPC__V_CMPX_T_U32 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_U32::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_I64 class methods --- + + Inst_VOPC__V_CMP_F_I64::Inst_VOPC__V_CMP_F_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_I64 + + Inst_VOPC__V_CMP_F_I64::~Inst_VOPC__V_CMP_F_I64() + { + } // ~Inst_VOPC__V_CMP_F_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_I64 class methods --- + + Inst_VOPC__V_CMP_LT_I64::Inst_VOPC__V_CMP_LT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_I64 + + Inst_VOPC__V_CMP_LT_I64::~Inst_VOPC__V_CMP_LT_I64() + { + } // ~Inst_VOPC__V_CMP_LT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_I64 class methods --- + + Inst_VOPC__V_CMP_EQ_I64::Inst_VOPC__V_CMP_EQ_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_I64 + + Inst_VOPC__V_CMP_EQ_I64::~Inst_VOPC__V_CMP_EQ_I64() + { + } // ~Inst_VOPC__V_CMP_EQ_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_I64 class methods --- + + Inst_VOPC__V_CMP_LE_I64::Inst_VOPC__V_CMP_LE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_I64 + + Inst_VOPC__V_CMP_LE_I64::~Inst_VOPC__V_CMP_LE_I64() + { + } // ~Inst_VOPC__V_CMP_LE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_I64 class methods --- + + Inst_VOPC__V_CMP_GT_I64::Inst_VOPC__V_CMP_GT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_I64 + + Inst_VOPC__V_CMP_GT_I64::~Inst_VOPC__V_CMP_GT_I64() + { + } // ~Inst_VOPC__V_CMP_GT_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_I64 class methods --- + + Inst_VOPC__V_CMP_NE_I64::Inst_VOPC__V_CMP_NE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_I64 + + Inst_VOPC__V_CMP_NE_I64::~Inst_VOPC__V_CMP_NE_I64() + { + } // ~Inst_VOPC__V_CMP_NE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_I64 class methods --- + + Inst_VOPC__V_CMP_GE_I64::Inst_VOPC__V_CMP_GE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_I64 + + Inst_VOPC__V_CMP_GE_I64::~Inst_VOPC__V_CMP_GE_I64() + { + } // ~Inst_VOPC__V_CMP_GE_I64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_I64 class methods --- + + Inst_VOPC__V_CMP_T_I64::Inst_VOPC__V_CMP_T_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_i64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_I64 + + Inst_VOPC__V_CMP_T_I64::~Inst_VOPC__V_CMP_T_I64() + { + } // ~Inst_VOPC__V_CMP_T_I64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_F_U64 class methods --- + + Inst_VOPC__V_CMP_F_U64::Inst_VOPC__V_CMP_F_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_f_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_F_U64 + + Inst_VOPC__V_CMP_F_U64::~Inst_VOPC__V_CMP_F_U64() + { + } // ~Inst_VOPC__V_CMP_F_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LT_U64 class methods --- + + Inst_VOPC__V_CMP_LT_U64::Inst_VOPC__V_CMP_LT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_lt_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LT_U64 + + Inst_VOPC__V_CMP_LT_U64::~Inst_VOPC__V_CMP_LT_U64() + { + } // ~Inst_VOPC__V_CMP_LT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_EQ_U64 class methods --- + + Inst_VOPC__V_CMP_EQ_U64::Inst_VOPC__V_CMP_EQ_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_eq_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_EQ_U64 + + Inst_VOPC__V_CMP_EQ_U64::~Inst_VOPC__V_CMP_EQ_U64() + { + } // ~Inst_VOPC__V_CMP_EQ_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_LE_U64 class methods --- + + Inst_VOPC__V_CMP_LE_U64::Inst_VOPC__V_CMP_LE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_le_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_LE_U64 + + Inst_VOPC__V_CMP_LE_U64::~Inst_VOPC__V_CMP_LE_U64() + { + } // ~Inst_VOPC__V_CMP_LE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GT_U64 class methods --- + + Inst_VOPC__V_CMP_GT_U64::Inst_VOPC__V_CMP_GT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_gt_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GT_U64 + + Inst_VOPC__V_CMP_GT_U64::~Inst_VOPC__V_CMP_GT_U64() + { + } // ~Inst_VOPC__V_CMP_GT_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_NE_U64 class methods --- + + Inst_VOPC__V_CMP_NE_U64::Inst_VOPC__V_CMP_NE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ne_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_NE_U64 + + Inst_VOPC__V_CMP_NE_U64::~Inst_VOPC__V_CMP_NE_U64() + { + } // ~Inst_VOPC__V_CMP_NE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_GE_U64 class methods --- + + Inst_VOPC__V_CMP_GE_U64::Inst_VOPC__V_CMP_GE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_ge_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_GE_U64 + + Inst_VOPC__V_CMP_GE_U64::~Inst_VOPC__V_CMP_GE_U64() + { + } // ~Inst_VOPC__V_CMP_GE_U64 + + // --- description from .arch file --- + // D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMP_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMP_T_U64 class methods --- + + Inst_VOPC__V_CMP_T_U64::Inst_VOPC__V_CMP_T_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmp_t_u64") + { + setFlag(ALU); + } // Inst_VOPC__V_CMP_T_U64 + + Inst_VOPC__V_CMP_T_U64::~Inst_VOPC__V_CMP_T_U64() + { + } // ~Inst_VOPC__V_CMP_T_U64 + + // --- description from .arch file --- + // D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMP_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_I64 class methods --- + + Inst_VOPC__V_CMPX_F_I64::Inst_VOPC__V_CMPX_F_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_I64 + + Inst_VOPC__V_CMPX_F_I64::~Inst_VOPC__V_CMPX_F_I64() + { + } // ~Inst_VOPC__V_CMPX_F_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_I64 class methods --- + + Inst_VOPC__V_CMPX_LT_I64::Inst_VOPC__V_CMPX_LT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_I64 + + Inst_VOPC__V_CMPX_LT_I64::~Inst_VOPC__V_CMPX_LT_I64() + { + } // ~Inst_VOPC__V_CMPX_LT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_I64 class methods --- + + Inst_VOPC__V_CMPX_EQ_I64::Inst_VOPC__V_CMPX_EQ_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_I64 + + Inst_VOPC__V_CMPX_EQ_I64::~Inst_VOPC__V_CMPX_EQ_I64() + { + } // ~Inst_VOPC__V_CMPX_EQ_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_I64 class methods --- + + Inst_VOPC__V_CMPX_LE_I64::Inst_VOPC__V_CMPX_LE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_I64 + + Inst_VOPC__V_CMPX_LE_I64::~Inst_VOPC__V_CMPX_LE_I64() + { + } // ~Inst_VOPC__V_CMPX_LE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_LE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_I64 class methods --- + + Inst_VOPC__V_CMPX_GT_I64::Inst_VOPC__V_CMPX_GT_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_I64 + + Inst_VOPC__V_CMPX_GT_I64::~Inst_VOPC__V_CMPX_GT_I64() + { + } // ~Inst_VOPC__V_CMPX_GT_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_I64 class methods --- + + Inst_VOPC__V_CMPX_NE_I64::Inst_VOPC__V_CMPX_NE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_I64 + + Inst_VOPC__V_CMPX_NE_I64::~Inst_VOPC__V_CMPX_NE_I64() + { + } // ~Inst_VOPC__V_CMPX_NE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_NE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 
1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_I64 class methods --- + + Inst_VOPC__V_CMPX_GE_I64::Inst_VOPC__V_CMPX_GE_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_I64 + + Inst_VOPC__V_CMPX_GE_I64::~Inst_VOPC__V_CMPX_GE_I64() + { + } // ~Inst_VOPC__V_CMPX_GE_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandI64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandI64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_I64 class methods --- + + Inst_VOPC__V_CMPX_T_I64::Inst_VOPC__V_CMPX_T_I64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_i64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_I64 + + Inst_VOPC__V_CMPX_T_I64::~Inst_VOPC__V_CMPX_T_I64() + { + } // ~Inst_VOPC__V_CMPX_T_I64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_I64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_F_U64 class methods --- + + Inst_VOPC__V_CMPX_F_U64::Inst_VOPC__V_CMPX_F_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_f_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_F_U64 + + Inst_VOPC__V_CMPX_F_U64::~Inst_VOPC__V_CMPX_F_U64() + { + } // ~Inst_VOPC__V_CMPX_F_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 0; D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_F_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LT_U64 class methods --- + + Inst_VOPC__V_CMPX_LT_U64::Inst_VOPC__V_CMPX_LT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_lt_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LT_U64 + + Inst_VOPC__V_CMPX_LT_U64::~Inst_VOPC__V_CMPX_LT_U64() + { + } // ~Inst_VOPC__V_CMPX_LT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 < S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] < src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_EQ_U64 class methods --- + + Inst_VOPC__V_CMPX_EQ_U64::Inst_VOPC__V_CMPX_EQ_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_eq_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_EQ_U64 + + Inst_VOPC__V_CMPX_EQ_U64::~Inst_VOPC__V_CMPX_EQ_U64() + { + } // ~Inst_VOPC__V_CMPX_EQ_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 == S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_EQ_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] == src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_LE_U64 class methods --- + + Inst_VOPC__V_CMPX_LE_U64::Inst_VOPC__V_CMPX_LE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_le_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_LE_U64 + + Inst_VOPC__V_CMPX_LE_U64::~Inst_VOPC__V_CMPX_LE_U64() + { + } // ~Inst_VOPC__V_CMPX_LE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <= S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_LE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] <= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GT_U64 class methods --- + + Inst_VOPC__V_CMPX_GT_U64::Inst_VOPC__V_CMPX_GT_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_gt_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GT_U64 + + Inst_VOPC__V_CMPX_GT_U64::~Inst_VOPC__V_CMPX_GT_U64() + { + } // ~Inst_VOPC__V_CMPX_GT_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 > S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GT_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_NE_U64 class methods --- + + Inst_VOPC__V_CMPX_NE_U64::Inst_VOPC__V_CMPX_NE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ne_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_NE_U64 + + Inst_VOPC__V_CMPX_NE_U64::~Inst_VOPC__V_CMPX_NE_U64() + { + } // ~Inst_VOPC__V_CMPX_NE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 <> S1); D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_NE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] != src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_GE_U64 class methods --- + + Inst_VOPC__V_CMPX_GE_U64::Inst_VOPC__V_CMPX_GE_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_ge_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_GE_U64 + + Inst_VOPC__V_CMPX_GE_U64::~Inst_VOPC__V_CMPX_GE_U64() + { + } // ~Inst_VOPC__V_CMPX_GE_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = (S0 >= S1); D = VCC in VOPC encoding. + void + Inst_VOPC__V_CMPX_GE_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, instData.SRC0); + ConstVecOperandU64 src1(gpuDynInst, instData.VSRC1); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + src0.readSrc(); + src1.read(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, src0[lane] >= src1[lane] ? 1 : 0); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute + // --- Inst_VOPC__V_CMPX_T_U64 class methods --- + + Inst_VOPC__V_CMPX_T_U64::Inst_VOPC__V_CMPX_T_U64(InFmt_VOPC *iFmt) + : Inst_VOPC(iFmt, "v_cmpx_t_u64") + { + setFlag(ALU); + setFlag(WritesEXEC); + } // Inst_VOPC__V_CMPX_T_U64 + + Inst_VOPC__V_CMPX_T_U64::~Inst_VOPC__V_CMPX_T_U64() + { + } // ~Inst_VOPC__V_CMPX_T_U64 + + // --- description from .arch file --- + // EXEC,D.u64[threadID] = 1; D = VCC in VOPC encoding. 
+ void + Inst_VOPC__V_CMPX_T_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vcc.setBit(lane, 1); + } + } + + wf->execMask() = vcc.rawData(); + vcc.write(); + } // execute +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh index 1760bd7213..1bb9b43d1f 100644 --- a/src/arch/amdgpu/vega/operand.hh +++ b/src/arch/amdgpu/vega/operand.hh @@ -37,6 +37,7 @@ #include "arch/amdgpu/vega/gpu_registers.hh" #include "arch/generic/vec_reg.hh" #include "gpu-compute/scalar_register_file.hh" +#include "gpu-compute/shader.hh" #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" @@ -489,7 +490,7 @@ namespace VegaISA typename std::enable_if::type setBit(int bit, int bit_val) { - DataType &sgpr = *((DataType*)srfData.data()); + GEM5_ALIGNED(8) DataType &sgpr = *((DataType*)srfData.data()); replaceBits(sgpr, bit, bit_val); } @@ -513,15 +514,49 @@ namespace VegaISA { assert(NumDwords == 1 || NumDwords == 2); + if (_opIdx >= REG_INT_CONST_POS_MIN && + _opIdx <= REG_INT_CONST_NEG_MAX) { + assert(sizeof(DataType) <= sizeof(srfData)); + DataType misc_val(0); + assert(isConstVal(_opIdx)); + misc_val = (DataType)_gpuDynInst + ->readConstVal(_opIdx); + std::memcpy((void*)srfData.data(), (void*)&misc_val, + sizeof(DataType)); + + return; + } + + if (_opIdx == REG_M0 || _opIdx == REG_ZERO || _opIdx == REG_SCC) { + assert(sizeof(DataType) <= sizeof(srfData)); + DataType misc_val(0); + misc_val = (DataType)_gpuDynInst->readMiscReg(_opIdx); + std::memcpy((void*)srfData.data(), (void*)&misc_val, + sizeof(DataType)); + + return; + } + switch(_opIdx) { case REG_EXEC_LO: { - ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> - execMask().to_ullong(); - std::memcpy((void*)srfData.data(), (void*)&exec_mask, - sizeof(exec_mask)); 
- DPRINTF(GPUSRF, "Read EXEC\n"); - DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask); + if constexpr (NumDwords == 2) { + ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> + execMask().to_ullong(); + std::memcpy((void*)srfData.data(), (void*)&exec_mask, + sizeof(exec_mask)); + DPRINTF(GPUSRF, "Read EXEC\n"); + DPRINTF(GPUSRF, "EXEC = %#x\n", exec_mask); + } else { + ScalarRegU64 exec_mask = _gpuDynInst->wavefront()-> + execMask().to_ullong(); + + ScalarRegU32 exec_mask_lo = bits(exec_mask, 31, 0); + std::memcpy((void*)srfData.data(), + (void*)&exec_mask_lo, sizeof(exec_mask_lo)); + DPRINTF(GPUSRF, "Read EXEC_LO\n"); + DPRINTF(GPUSRF, "EXEC_LO = %#x\n", exec_mask_lo); + } } break; case REG_EXEC_HI: @@ -544,8 +579,83 @@ namespace VegaISA case REG_SRC_SWDA: case REG_SRC_DPP: case REG_SRC_LITERAL: - assert(NumDwords == 1); + /** + * From the Vega specification: + * When a literal constant is used with a 64 bit instruction, + * the literal is expanded to 64 bits by: padding the LSBs + * with zeros for floats, padding the MSBs with zeros for + * unsigned ints, and by sign-extending signed ints. 
+ */ srfData[0] = _gpuDynInst->srcLiteral(); + if constexpr (NumDwords == 2) { + if constexpr (std::is_integral_v) { + if constexpr (std::is_signed_v) { + if (bits(srfData[0], 31, 31) == 1) { + srfData[1] = 0xffffffff; + } else { + srfData[1] = 0; + } + } else { + srfData[1] = 0; + } + } else { + srfData[1] = _gpuDynInst->srcLiteral(); + srfData[0] = 0; + } + } + break; + case REG_SHARED_BASE: + { + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 shared_base = cu->shader->ldsApe().base; + std::memcpy((void*)srfData.data(), (void*)&shared_base, + sizeof(srfData)); + DPRINTF(GPUSRF, "Read SHARED_BASE = %#x\n", + shared_base); + } + } + break; + case REG_SHARED_LIMIT: + { + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 shared_limit = cu->shader->ldsApe().limit; + std::memcpy((void*)srfData.data(), + (void*)&shared_limit, sizeof(srfData)); + DPRINTF(GPUSRF, "Read SHARED_LIMIT = %#x\n", + shared_limit); + } + } + break; + case REG_PRIVATE_BASE: + { + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 priv_base = cu->shader->scratchApe().base; + std::memcpy((void*)srfData.data(), (void*)&priv_base, + sizeof(srfData)); + DPRINTF(GPUSRF, "Read PRIVATE_BASE = %#x\n", + priv_base); + } + } + break; + case REG_PRIVATE_LIMIT: + { + assert(NumDwords == 2); + if constexpr (NumDwords == 2) { + ComputeUnit *cu = _gpuDynInst->computeUnit(); + ScalarRegU64 priv_limit = + cu->shader->scratchApe().limit; + std::memcpy((void*)srfData.data(), (void*)&priv_limit, + sizeof(srfData)); + DPRINTF(GPUSRF, "Read PRIVATE_LIMIT = %#x\n", + priv_limit); + } + } break; case REG_POS_HALF: { @@ -617,18 +727,8 @@ namespace VegaISA } break; default: - { - assert(sizeof(DataType) <= sizeof(srfData)); - DataType misc_val(0); - if (isConstVal(_opIdx)) { - misc_val = (DataType)_gpuDynInst - 
->readConstVal(_opIdx); - } else { - misc_val = (DataType)_gpuDynInst->readMiscReg(_opIdx); - } - std::memcpy((void*)srfData.data(), (void*)&misc_val, - sizeof(DataType)); - } + panic("Invalid special register index: %d\n", _opIdx); + break; } } @@ -674,7 +774,7 @@ namespace VegaISA * of a register is 1 dword. this class will take care to do the * proper packing/unpacking of sub-dword operands. */ - std::array srfData; + GEM5_ALIGNED(8) std::array srfData; }; // typedefs for the various sizes/types of scalar operands @@ -735,6 +835,142 @@ namespace VegaISA using ConstVecOperandU128 = VecOperand; using ConstVecOperandU256 = VecOperand; using ConstVecOperandU512 = VecOperand; + + +// Helper class for using multiple VecElemU32 to represent data types which +// do not divide a dword evenly. +template +class PackedReg +{ + // Logical view is: + // dword N, dword N - 1, ..., dword 1, dword 0. + // Within each dword, the element starts at [ELEM_SIZE:0]. For example, + // for ELEM_SIZE = 6 for fp6 types, [5:0] is the first value, [11:6] is + // the second, and so forth. For 6 bits specifically, the 6th element + // spans dword 0 and dword 1. + static_assert(BITS % 32 == 0); + static_assert(BITS % ELEM_SIZE == 0); + static_assert(ELEM_SIZE <= 32); + + static constexpr int NumDwords = BITS / 32; + uint32_t dwords[NumDwords] = {}; + + public: + PackedReg() = default; + + void + setDword(int dw, uint32_t value) + { + assert(dw < NumDwords); + dwords[dw] = value; + } + + uint32_t + getDword(int dw) + { + assert(dw < NumDwords); + return dwords[dw]; + } + + uint32_t + getElem(int elem) + { + assert(elem < (BITS / ELEM_SIZE)); + + // Get the upper/lower *bit* location of the element. + int ubit, lbit; + ubit = elem * ELEM_SIZE + (ELEM_SIZE - 1); + lbit = elem * ELEM_SIZE; + + // Convert the bit locations to upper/lower dwords. It is possible + // to span two dwords but this does not have to support spanning + // more than two dwords. 
+ int udw, ldw; + udw = ubit / 32; + ldw = lbit / 32; + assert(udw == ldw || udw == ldw + 1); + + if (udw == ldw) { + // Easy case, just shift the dword value and mask to get value. + int dw_lbit = lbit % 32; + + uint32_t elem_mask = (1UL << ELEM_SIZE) - 1; + uint32_t rv = (dwords[ldw] >> dw_lbit) & elem_mask; + + return rv; + } + + // Harder case. To make it easier put into a quad word and shift + // that variable instead of trying to work with two. + uint64_t qword = + uint64_t(dwords[udw]) << 32 | uint64_t(dwords[ldw]); + + int qw_lbit = lbit % 32; + + uint64_t elem_mask = (1ULL << ELEM_SIZE) - 1; + uint32_t rv = uint32_t((qword >> qw_lbit) & elem_mask); + + return rv; + } + + void + setElem(int elem, uint32_t value) + { + assert(elem < (BITS / ELEM_SIZE)); + + // Get the upper/lower *bit* location of the element. + int ubit, lbit; + ubit = elem * ELEM_SIZE + (ELEM_SIZE - 1); + lbit = elem * ELEM_SIZE; + + // Convert the bit locations to upper/lower dwords. It is possible + // to span two dwords but this does not have to support spanning + // more than two dwords. + int udw, ldw; + udw = ubit / 32; + ldw = lbit / 32; + assert(udw == ldw || udw == ldw + 1); + + if (udw == ldw) { + // Easy case, just shift the dword value and mask to get value. + int dw_lbit = lbit % 32; + + // Make sure the value is not going to clobber another element. + uint32_t elem_mask = (1UL << ELEM_SIZE) - 1; + value &= elem_mask; + + // Clear the bits we are setting. + elem_mask <<= dw_lbit; + dwords[ldw] &= ~elem_mask; + + value <<= dw_lbit; + dwords[ldw] |= value; + + return; + } + + // Harder case. Put the two dwords in a quad word and manipulate that. + // Then place the two new dwords back into the storage. + uint64_t qword = + uint64_t(dwords[udw]) << 32 | uint64_t(dwords[ldw]); + + int qw_lbit = lbit % 32; + + // Make sure the value is not going to clobber another element. 
+ uint64_t elem_mask = (1ULL << ELEM_SIZE) - 1; + value &= elem_mask; + + elem_mask <<= qw_lbit; + qword &= elem_mask; + + value <<= qw_lbit; + qword |= value; + + dwords[udw] = uint32_t(qword >> 32); + dwords[ldw] = uint32_t(qword & mask(32)); + } +}; + } } // namespace gem5 diff --git a/src/arch/arm/ArmISA.py b/src/arch/arm/ArmISA.py index 8e8d2b641c..8b9cf25b25 100644 --- a/src/arch/arm/ArmISA.py +++ b/src/arch/arm/ArmISA.py @@ -1,4 +1,4 @@ -# Copyright (c) 2012-2013, 2015-2022 ARM Limited +# Copyright (c) 2012-2013, 2015-2022, 2024 Arm Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -52,7 +52,11 @@ class DecoderFlavor(Enum): class ArmDefaultSERelease(ArmRelease): extensions = [ - "CRYPTO", + "FEAT_AES", + "FEAT_PMULL", + "FEAT_SHA1", + "FEAT_SHA256", + "FEAT_CRC32", # Armv8.1 "FEAT_LSE", "FEAT_RDM", @@ -166,6 +170,10 @@ class ArmISA(BaseISA): 0x0000000000010010, "AArch64 Memory Model Feature Register 2" ) + # HAS_SDEFLT | HAS_FORCE_NS | HAS_TIDR | PMG_MAX = 128 | + # VPMR_MAX = 7 | HAS_HCR | PARTID_MAX = 256 + mpamidr_el1 = Param.UInt64(0x34000080001E0100, "MPAM ID Register (EL1)") + # Any access (read/write) to an unimplemented # Implementation Defined registers is not causing an Undefined Instruction. # It is rather executed as a NOP. diff --git a/src/arch/arm/ArmSemihosting.py b/src/arch/arm/ArmSemihosting.py index 8c8375e208..ffc285fc8f 100644 --- a/src/arch/arm/ArmSemihosting.py +++ b/src/arch/arm/ArmSemihosting.py @@ -33,36 +33,10 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from m5.objects.Serial import SerialDevice -from m5.objects.Terminal import Terminal -from m5.params import * -from m5.SimObject import * +from m5.objects.BaseSemihosting import BaseSemihosting -class ArmSemihosting(SimObject): +class ArmSemihosting(BaseSemihosting): type = "ArmSemihosting" cxx_header = "arch/arm/semihosting.hh" cxx_class = "gem5::ArmSemihosting" - - cmd_line = Param.String("", "Command line to report to guest") - stdin = Param.String("stdin", "Standard input (stdin for gem5's terminal)") - stdout = Param.String( - "stdout", "Standard output (stdout for gem5's terminal)" - ) - stderr = Param.String( - "stderr", "Standard error (stderr for gem5's terminal)" - ) - files_root_dir = Param.String( - "", "Host root directory for files handled by Semihosting" - ) - - mem_reserve = Param.MemorySize( - "32MiB", - "Amount of memory to reserve at the start of the address map. This " - "memory won't be used by the heap reported to an application.", - ) - stack_size = Param.MemorySize("32MiB", "Application stack size") - - time = Param.Time( - "01/01/2009", "System time to use ('Now' for actual time)" - ) diff --git a/src/arch/arm/ArmSystem.py b/src/arch/arm/ArmSystem.py index dc138cafc3..94cbb496aa 100644 --- a/src/arch/arm/ArmSystem.py +++ b/src/arch/arm/ArmSystem.py @@ -1,4 +1,4 @@ -# Copyright (c) 2009, 2012-2013, 2015-2023 Arm Limited +# Copyright (c) 2009, 2012-2013, 2015-2024 Arm Limited # All rights reserved. 
# # The license below extends only to copyright in the software and shall @@ -65,6 +65,11 @@ class SmeVectorLength(UInt8): class ArmExtension(ScopedEnum): vals = [ + "FEAT_AES", + "FEAT_PMULL", + "FEAT_SHA1", + "FEAT_SHA256", + "FEAT_CRC32", # Armv8.1 "FEAT_VHE", "FEAT_PAN", @@ -109,8 +114,8 @@ class ArmExtension(ScopedEnum): "SECURITY", "LPAE", "VIRTUALIZATION", - "CRYPTO", "TME", + "FEAT_MPAM", ] @@ -159,7 +164,16 @@ class ArmRelease(SimObject): class Armv8(ArmRelease): - extensions = ["LPAE", "VIRTUALIZATION", "SECURITY"] + extensions = [ + "LPAE", + "VIRTUALIZATION", + "SECURITY", + "FEAT_AES", + "FEAT_PMULL", + "FEAT_SHA1", + "FEAT_SHA256", + "FEAT_CRC32", + ] class ArmDefaultRelease(Armv8): diff --git a/src/arch/arm/SConscript b/src/arch/arm/SConscript index 0aa4e66659..948cefb4e4 100644 --- a/src/arch/arm/SConscript +++ b/src/arch/arm/SConscript @@ -1,6 +1,6 @@ # -*- mode:python -*- -# Copyright (c) 2009, 2012-2013, 2017-2018, 2020 ARM Limited +# Copyright (c) 2009, 2012-2013, 2017-2018, 2020, 2024 Arm Limited # All rights reserved. 
# # The license below extends only to copyright in the software and shall @@ -40,6 +40,8 @@ Import('*') +Source('insts/fplib.cc') + if env['CONF']['USE_ARM_ISA']: env.TagImplies('arm isa', 'gem5 lib') @@ -73,7 +75,6 @@ Source('insts/static_inst.cc', tags='arm isa') Source('insts/sve.cc', tags='arm isa') Source('insts/sve_mem.cc', tags='arm isa') Source('insts/vfp.cc', tags='arm isa') -Source('insts/fplib.cc', tags='arm isa') Source('insts/crypto.cc', tags='arm isa') Source('insts/tme64.cc', tags='arm isa') if env['CONF']['PROTOCOL'] == 'MESI_Three_Level_HTM': @@ -92,6 +93,7 @@ Source('fs_workload.cc', tags='arm isa') Source('regs/int.cc', tags='arm isa') Source('regs/misc.cc', tags='arm isa') Source('mmu.cc', tags='arm isa') +Source('mpam.cc', tags='arm isa') Source('nativetrace.cc', tags='arm isa') Source('pagetable.cc', tags='arm isa') Source('pauth_helpers.cc', tags='arm isa') @@ -132,7 +134,7 @@ SimObject('ArmCPU.py', sim_objects=[], tags='arm isa') DebugFlag('Arm', tags='arm isa') DebugFlag('ArmTme', 'Transactional Memory Extension', tags='arm isa') -DebugFlag('Semihosting', tags='arm isa') +DebugFlag('MPAM', 'MPAM debug flag', tags='arm isa') DebugFlag('PMUVerbose', "Performance Monitor", tags='arm isa') # Add files generated by the ISA description. 
diff --git a/src/arch/arm/fastmodel/remote_gdb.cc b/src/arch/arm/fastmodel/remote_gdb.cc index 555439ed75..0999f40ddd 100644 --- a/src/arch/arm/fastmodel/remote_gdb.cc +++ b/src/arch/arm/fastmodel/remote_gdb.cc @@ -63,7 +63,7 @@ FastmodelRemoteGDB::AArch64GdbRegCache::setRegs(ThreadContext *context) const FastmodelRemoteGDB::FastmodelRemoteGDB(System *_system, ListenSocketConfig _listen_config) - : gem5::ArmISA::RemoteGDB(_system, _listen_config) + : gem5::ArmISA::RemoteGDB(_system, _listen_config), regCache64(this) { } diff --git a/src/arch/arm/faults.cc b/src/arch/arm/faults.cc index 4b906f226f..4c5e2111f4 100644 --- a/src/arch/arm/faults.cc +++ b/src/arch/arm/faults.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, 2012-2014, 2016-2019, 2022 Arm Limited + * Copyright (c) 2010, 2012-2014, 2016-2019, 2022, 2024 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -44,6 +44,7 @@ #include "arch/arm/insts/static_inst.hh" #include "arch/arm/interrupts.hh" #include "arch/arm/isa.hh" +#include "arch/arm/regs/misc_accessors.hh" #include "arch/arm/self_debug.hh" #include "arch/arm/system.hh" #include "arch/arm/utility.hh" @@ -378,22 +379,6 @@ ArmFault::getSyndromeReg64() const } } -MiscRegIndex -ArmFault::getFaultAddrReg64() const -{ - switch (toEL) { - case EL1: - return MISCREG_FAR_EL1; - case EL2: - return MISCREG_FAR_EL2; - case EL3: - return MISCREG_FAR_EL3; - default: - panic("Invalid exception level"); - break; - } -} - void ArmFault::setSyndrome(ThreadContext *tc, MiscRegIndex syndrome_reg) { @@ -1113,13 +1098,15 @@ AbortFault::invoke(ThreadContext *tc, const StaticInstPtr &inst) if (stage2) { // stage 2 fault, set HPFAR_EL2 to the faulting IPA // and FAR_EL2 to the Original VA - tc->setMiscReg(AbortFault::getFaultAddrReg64(), OVAddr); + misc_regs::writeRegister( + tc, OVAddr, this->toEL); tc->setMiscReg(MISCREG_HPFAR_EL2, bits(faultAddr, 47, 12) << 4); DPRINTF(Faults, "Abort Fault (Stage 2) VA: 0x%x IPA: 
0x%x\n", OVAddr, faultAddr); } else { - tc->setMiscReg(AbortFault::getFaultAddrReg64(), faultAddr); + misc_regs::writeRegister( + tc, faultAddr, this->toEL); } } } @@ -1517,7 +1504,7 @@ PCAlignmentFault::invoke(ThreadContext *tc, const StaticInstPtr &inst) ArmFaultVals::invoke(tc, inst); assert(from64); // Set the FAR - tc->setMiscReg(getFaultAddrReg64(), faultPC); + misc_regs::writeRegister(tc, faultPC, toEL); } bool @@ -1661,8 +1648,7 @@ Watchpoint::invoke(ThreadContext *tc, const StaticInstPtr &inst) { ArmFaultVals::invoke(tc, inst); // Set the FAR - tc->setMiscReg(getFaultAddrReg64(), vAddr); - + misc_regs::writeRegister(tc, vAddr, toEL); } bool diff --git a/src/arch/arm/faults.hh b/src/arch/arm/faults.hh index ec60d629f5..d289ac2f40 100644 --- a/src/arch/arm/faults.hh +++ b/src/arch/arm/faults.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, 2012-2013, 2016-2019, 2022 Arm Limited + * Copyright (c) 2010, 2012-2013, 2016-2019, 2022, 2024 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -220,9 +220,6 @@ class ArmFault : public FaultBase // Returns the actual syndrome register to use based on the target // exception level MiscRegIndex getSyndromeReg64() const; - // Returns the actual fault address register to use based on the target - // exception level - MiscRegIndex getFaultAddrReg64() const; void invoke(ThreadContext *tc, const StaticInstPtr &inst = nullStaticInstPtr) override; diff --git a/src/arch/arm/insts/mem64.cc b/src/arch/arm/insts/mem64.cc index 7576a5c2af..ead4428ad6 100644 --- a/src/arch/arm/insts/mem64.cc +++ b/src/arch/arm/insts/mem64.cc @@ -61,8 +61,8 @@ SysDC64::generateDisassembly(Addr pc, const loader::SymbolTable *symtab) const uint32_t SysDC64::iss() const { - const MiscRegNum64 &misc_reg = encodeAArch64SysReg(dest); - return _iss(misc_reg, base); + const auto misc_reg = encodeAArch64SysReg(dest); + return _iss(misc_reg.value(), base); } void diff --git a/src/arch/arm/insts/misc.cc 
b/src/arch/arm/insts/misc.cc index 546d2caebb..78d489ec12 100644 --- a/src/arch/arm/insts/misc.cc +++ b/src/arch/arm/insts/misc.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, 2012-2013, 2017-2018, 2021 Arm Limited + * Copyright (c) 2010, 2012-2013, 2017-2018, 2021, 2023-2024 Arm Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved * @@ -408,11 +408,20 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const switch (dest_idx) { case MISCREG_TLBIALL: // TLBI all entries, EL0&1, { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIALL tlbiOp(EL1, secure); - tlbiOp(tc); + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TLBIALL tlbiOp(TranslationRegime::EL10, secure); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // TLB Invalidate All, Inner Shareable @@ -421,58 +430,94 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIALL tlbiOp(EL1, secure); + TLBIALL tlbiOp(TranslationRegime::EL10, secure); tlbiOp.broadcast(tc); return; } // Instruction TLB Invalidate All case MISCREG_ITLBIALL: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - ITLBIALL tlbiOp(EL1, secure); - tlbiOp(tc); + // Check for Force Broadcast. 
Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + ITLBIALL tlbiOp(TranslationRegime::EL10, secure); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // Data TLB Invalidate All case MISCREG_DTLBIALL: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - DTLBIALL tlbiOp(EL1, secure); - tlbiOp(tc); + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + DTLBIALL tlbiOp(TranslationRegime::EL10, secure); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // TLB Invalidate by VA case MISCREG_TLBIMVA: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVA tlbiOp(EL1, + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TLBIMVA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), bits(value, 7, 0), false); - tlbiOp(tc); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // TLB Invalidate by VA, Last Level case MISCREG_TLBIMVAL: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVA tlbiOp(EL1, + // Check for Force Broadcast. 
Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TLBIMVA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), bits(value, 7, 0), true); - tlbiOp(tc); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // TLB Invalidate by VA, Inner Shareable @@ -481,7 +526,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVA tlbiOp(EL1, + TLBIMVA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), bits(value, 7, 0), @@ -496,7 +541,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVA tlbiOp(EL1, + TLBIMVA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), bits(value, 7, 0), @@ -508,14 +553,23 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const // TLB Invalidate by ASID match case MISCREG_TLBIASID: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIASID tlbiOp(EL1, + // Check for Force Broadcast. 
Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TLBIASID tlbiOp(TranslationRegime::EL10, secure, bits(value, 7, 0)); - tlbiOp(tc); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // TLB Invalidate by ASID match, Inner Shareable @@ -524,7 +578,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIASID tlbiOp(EL1, + TLBIASID tlbiOp(TranslationRegime::EL10, secure, bits(value, 7, 0)); @@ -534,25 +588,42 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const // TLB Invalidate by VA, All ASID case MISCREG_TLBIMVAA: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(EL1, secure, + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + TLBIMVAA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), false); - tlbiOp(tc); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // TLB Invalidate by VA, Last Level, All ASID case MISCREG_TLBIMVAAL: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(EL1, secure, + // Check for Force Broadcast. 
Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TLBIMVAA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), true); - tlbiOp(tc); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // TLB Invalidate by VA, All ASID, Inner Shareable @@ -561,7 +632,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(EL1, secure, + TLBIMVAA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), false); tlbiOp.broadcast(tc); @@ -573,7 +644,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(EL1, secure, + TLBIMVAA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), true); tlbiOp.broadcast(tc); @@ -585,7 +656,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(EL2, secure, + TLBIMVAA tlbiOp(TranslationRegime::EL2, secure, mbits(value, 31, 12), false); tlbiOp(tc); @@ -597,7 +668,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(EL2, secure, + TLBIMVAA tlbiOp(TranslationRegime::EL2, secure, mbits(value, 31, 12), true); tlbiOp(tc); @@ -609,7 +680,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(EL2, secure, + TLBIMVAA tlbiOp(TranslationRegime::EL2, secure, mbits(value, 31, 12), false); 
tlbiOp.broadcast(tc); @@ -621,7 +692,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(EL2, secure, + TLBIMVAA tlbiOp(TranslationRegime::EL2, secure, mbits(value, 31, 12), true); tlbiOp.broadcast(tc); @@ -633,7 +704,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIIPA tlbiOp(EL1, + TLBIIPA tlbiOp(TranslationRegime::EL10, secure, static_cast(bits(value, 35, 0)) << 12, false); @@ -648,7 +719,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIIPA tlbiOp(EL1, + TLBIIPA tlbiOp(TranslationRegime::EL10, secure, static_cast(bits(value, 35, 0)) << 12, true); @@ -663,7 +734,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIIPA tlbiOp(EL1, + TLBIIPA tlbiOp(TranslationRegime::EL10, secure, static_cast(bits(value, 35, 0)) << 12, false); @@ -678,7 +749,7 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIIPA tlbiOp(EL1, + TLBIIPA tlbiOp(TranslationRegime::EL10, secure, static_cast(bits(value, 35, 0)) << 12, true); @@ -689,82 +760,117 @@ TlbiOp::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const // Instruction TLB Invalidate by VA case MISCREG_ITLBIMVA: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - ITLBIMVA 
tlbiOp(EL1, + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + ITLBIMVA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), bits(value, 7, 0)); - tlbiOp(tc); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // Data TLB Invalidate by VA case MISCREG_DTLBIMVA: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - DTLBIMVA tlbiOp(EL1, + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + DTLBIMVA tlbiOp(TranslationRegime::EL10, secure, mbits(value, 31, 12), bits(value, 7, 0)); - tlbiOp(tc); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // Instruction TLB Invalidate by ASID match case MISCREG_ITLBIASID: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - ITLBIASID tlbiOp(EL1, + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + ITLBIASID tlbiOp(TranslationRegime::EL10, secure, bits(value, 7, 0)); - tlbiOp(tc); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // Data TLB Invalidate by ASID match case MISCREG_DTLBIASID: { + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - DTLBIASID tlbiOp(EL1, + // Check for Force Broadcast. 
Ignored if HCR_EL2.TGE == 1 + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + DTLBIASID tlbiOp(TranslationRegime::EL10, secure, bits(value, 7, 0)); - tlbiOp(tc); + if (shareable) { + tlbiOp.broadcast(tc); + } else { + tlbiOp(tc); + } return; } // TLB Invalidate All, Non-Secure Non-Hyp case MISCREG_TLBIALLNSNH: { - TLBIALLN tlbiOp(EL1); + TLBIALLN tlbiOp(TranslationRegime::EL10); tlbiOp(tc); return; } // TLB Invalidate All, Non-Secure Non-Hyp, Inner Shareable case MISCREG_TLBIALLNSNHIS: { - TLBIALLN tlbiOp(EL1); + TLBIALLN tlbiOp(TranslationRegime::EL10); tlbiOp.broadcast(tc); return; } // TLB Invalidate All, Hyp mode case MISCREG_TLBIALLH: { - TLBIALLN tlbiOp(EL2); + TLBIALLN tlbiOp(TranslationRegime::EL2); tlbiOp(tc); return; } // TLB Invalidate All, Hyp mode, Inner Shareable case MISCREG_TLBIALLHIS: { - TLBIALLN tlbiOp(EL2); + TLBIALLN tlbiOp(TranslationRegime::EL2); tlbiOp.broadcast(tc); return; } diff --git a/src/arch/arm/insts/misc64.cc b/src/arch/arm/insts/misc64.cc index 4919d92da8..a5ca423ea6 100644 --- a/src/arch/arm/insts/misc64.cc +++ b/src/arch/arm/insts/misc64.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2013,2017-2023 Arm Limited + * Copyright (c) 2011-2013,2017-2024 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -186,8 +186,9 @@ MiscRegRegImmOp64::generateDisassembly( uint32_t MiscRegRegImmOp64::iss() const { - const MiscRegNum64 &misc_reg = encodeAArch64SysReg(dest); - return _iss(misc_reg, op1); + const auto misc_reg = encodeAArch64SysReg(dest); + assert(misc_reg.has_value()); + return _iss(misc_reg.value(), op1); } std::string @@ -205,8 +206,9 @@ RegMiscRegImmOp64::generateDisassembly( uint32_t RegMiscRegImmOp64::iss() const { - const MiscRegNum64 &misc_reg = encodeAArch64SysReg(op1); - return _iss(misc_reg, dest); + const auto misc_reg = encodeAArch64SysReg(op1); + assert(misc_reg.has_value()); + return _iss(misc_reg.value(), dest); } Fault @@ 
-243,964 +245,1114 @@ RegNone::generateDisassembly( return ss.str(); } +void +TlbiOp64::tlbiAll(ThreadContext *tc, RegVal value, + bool secure, TranslationRegime regime, bool shareable) +{ + TLBIALLEL tlbi_op(regime, secure); + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } +} + +void +TlbiOp64::tlbiVmall(ThreadContext *tc, RegVal value, + bool secure, TranslationRegime regime, bool shareable, bool stage2) +{ + TLBIVMALL tlbi_op(regime, secure, stage2); + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } +} + +void +TlbiOp64::tlbiVa(ThreadContext *tc, RegVal value, + bool secure, TranslationRegime regime, bool shareable, bool last_level) +{ + if (MMU::hasUnprivRegime(regime)) { + // The asid will only be used when e2h == 1 + bool asid_16bits = ArmSystem::haveLargeAsid64(tc); + auto asid = asid_16bits ? bits(value, 63, 48) : + bits(value, 55, 48); + + TLBIMVA tlbi_op(regime, secure, static_cast(bits(value, 43, 0)) << 12, + asid, last_level); + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } + } else { + TLBIMVAA tlbi_op(regime, secure, static_cast(bits(value, 43, 0)) << 12, last_level); + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } + } +} + +void +TlbiOp64::tlbiVaa(ThreadContext *tc, RegVal value, + bool secure, TranslationRegime regime, bool shareable, bool last_level) +{ + TLBIMVAA tlbi_op(regime, secure, static_cast(bits(value, 43, 0)) << 12, last_level); + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } +} + +void +TlbiOp64::tlbiAsid(ThreadContext *tc, RegVal value, + bool secure, TranslationRegime regime, bool shareable) +{ + bool asid_16bits = ArmSystem::haveLargeAsid64(tc); + auto asid = asid_16bits ? 
bits(value, 63, 48) : + bits(value, 55, 48); + + TLBIASID tlbi_op(regime, secure, asid); + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } +} + +void +TlbiOp64::tlbiIpaS2(ThreadContext *tc, RegVal value, + bool secure, TranslationRegime regime, bool shareable, bool last_level) +{ + if (EL2Enabled(tc)) { + auto isa = static_cast(tc->getIsaPtr()); + auto release = isa->getRelease(); + + SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); + bool secure = release->has(ArmExtension::SECURITY) && + !scr.ns && !bits(value, 63); + + const int top_bit = ArmSystem::physAddrRange(tc) == 52 ? + 39 : 35; + TLBIIPA tlbi_op(TranslationRegime::EL10, secure, + static_cast(bits(value, top_bit, 0)) << 12, + last_level); + + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } + } +} + +void +TlbiOp64::tlbiRvaa(ThreadContext *tc, RegVal value, + bool secure, TranslationRegime regime, bool shareable, bool last_level) +{ + TLBIRMVAA tlbi_op(regime, secure, value, last_level); + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } +} + +void +TlbiOp64::tlbiRva(ThreadContext *tc, RegVal value, + bool secure, TranslationRegime regime, bool shareable, bool last_level) +{ + if (MMU::hasUnprivRegime(regime)) { + // The asid will only be used when e2h == 1 + bool asid_16bits = ArmSystem::haveLargeAsid64(tc); + auto asid = asid_16bits ? 
bits(value, 63, 48) : + bits(value, 55, 48); + + TLBIRMVA tlbi_op(regime, secure, value, asid, last_level); + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } + } else { + tlbiRvaa(tc, value, secure, regime, shareable, last_level); + } +} + +void +TlbiOp64::tlbiRipaS2(ThreadContext *tc, RegVal value, + bool secure, TranslationRegime regime, bool shareable, bool last_level) +{ + if (EL2Enabled(tc)) { + auto isa = static_cast(tc->getIsaPtr()); + auto release = isa->getRelease(); + SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); + bool secure = release->has(ArmExtension::SECURITY) && + !scr.ns && !bits(value, 63); + + TLBIRIPA tlbi_op(TranslationRegime::EL10, secure, value, last_level); + + if (shareable) { + tlbi_op.broadcast(tc); + } else { + tlbi_op(tc); + } + } +} + +std::unordered_map TlbiOp64::tlbiOps = { + { MISCREG_TLBI_ALLE3, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiAll(tc, value, + true, // secure + TranslationRegime::EL3, // regime + false); // shareable + } + }, + + { MISCREG_TLBI_ALLE3IS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiAll(tc, value, + true, // secure + TranslationRegime::EL3, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_ALLE3OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiAll(tc, value, + true, // secure + TranslationRegime::EL3, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_ALLE2, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + TlbiOp64::tlbiAll(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + false); // shareable + } + }, + + { MISCREG_TLBI_ALLE2IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? 
+ TranslationRegime::EL20 : TranslationRegime::EL2; + + TlbiOp64::tlbiAll(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_ALLE2OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + TlbiOp64::tlbiAll(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_ALLE1, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiAll(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + false); // shareable + } + }, + + { MISCREG_TLBI_ALLE1IS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiAll(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_ALLE1OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiAll(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_VMALLE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TlbiOp64::tlbiVmall(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + shareable); // shareable + } + }, + + { MISCREG_TLBI_VMALLE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? 
+ TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVmall(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_VMALLE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVmall(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_VMALLS12E1, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVmall(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + false, // shareable + true); // stage2 + } + }, + + { MISCREG_TLBI_VMALLS12E1IS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVmall(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + true); // stage2 + } + }, + + { MISCREG_TLBI_VMALLS12E1OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVmall(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + true); // stage2 + } + }, + + { MISCREG_TLBI_VAE3, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVa(tc, value, + true, // secure + TranslationRegime::EL3, // regime + false, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_VAE3IS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVa(tc, value, + true, // secure + TranslationRegime::EL3, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_VAE3OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVa(tc, value, + true, // secure + TranslationRegime::EL3, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_VALE3, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVa(tc, value, + true, // secure + TranslationRegime::EL3, // regime + false, // shareable + 
true); // last level only + } + }, + + { MISCREG_TLBI_VALE3IS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVa(tc, value, + true, // secure + TranslationRegime::EL3, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_VALE3OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVa(tc, value, + true, // secure + TranslationRegime::EL3, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_VAE2, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + false, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_VAE2IS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, EL2), // secure + TranslationRegime::EL2, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_VAE2OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc,EL2), // secure + TranslationRegime::EL2, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_VALE2, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + false, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_VALE2IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_VALE2OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? 
+ TranslationRegime::EL20 : TranslationRegime::EL2; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_VAE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + shareable, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_VAE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_VAE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_VALE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + false, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_VALE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? 
+ TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_VALE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_ASIDE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TlbiOp64::tlbiAsid(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + shareable); // shareable + } + }, + + { MISCREG_TLBI_ASIDE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiAsid(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_ASIDE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiAsid(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true); // shareable + } + }, + + { MISCREG_TLBI_VAAE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + // Check for Force Broadcast. 
Ignored if HCR_EL2.TGE == 1 + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TlbiOp64::tlbiVaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + shareable, // shareable + false); // last level + } + }, + + { MISCREG_TLBI_VAAE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + false); // last level + } + }, + + { MISCREG_TLBI_VAAE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + false); // last level + } + }, + + { MISCREG_TLBI_VAALE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TlbiOp64::tlbiVaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + shareable, // shareable + true); // last level + } + }, + + { MISCREG_TLBI_VAALE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + true); // last level + } + }, + + { MISCREG_TLBI_VAALE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? 
+ TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiVaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + true); // last level + } + }, + + { MISCREG_TLBI_IPAS2E1, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiIpaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + false, // shareable + false); // last level + } + }, + + { MISCREG_TLBI_IPAS2E1IS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiIpaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + false); // last level + } + }, + + { MISCREG_TLBI_IPAS2E1OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiIpaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + false); // last level + } + }, + + { MISCREG_TLBI_IPAS2LE1, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiIpaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + false, // shareable + true); // last level + } + }, + + { MISCREG_TLBI_IPAS2LE1IS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiIpaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + true); // last level + } + }, + + { MISCREG_TLBI_IPAS2LE1OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiIpaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + true); // last level + } + }, + + { MISCREG_TLBI_RVAE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + // Check for Force Broadcast. 
Ignored if HCR_EL2.TGE == 1 + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + tlbiRva(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + shareable, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVAE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + tlbiRva(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVAE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + tlbiRva(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVAAE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TlbiOp64::tlbiRvaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + shareable, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVAAE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiRvaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVAAE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? 
+ TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiRvaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVALE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + // Check for Force Broadcast. Ignored if HCR_EL2.TGE == 1 + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + tlbiRva(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + shareable, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVALE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + tlbiRva(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVALE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + tlbiRva(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVAALE1, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + // Check for Force Broadcast. 
Ignored if HCR_EL2.TGE == 1 + HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); + bool shareable = currEL(tc) == EL1 && EL2Enabled(tc) && + hcr.fb && !hcr.tge; + + TlbiOp64::tlbiRvaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + shareable, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVAALE1IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiRvaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVAALE1OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL0) ? + TranslationRegime::EL20 : TranslationRegime::EL10; + + TlbiOp64::tlbiRvaa(tc, value, + isSecureAtEL(tc, translationEl(regime)), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RIPAS2E1, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiRipaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + false, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RIPAS2E1IS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiRipaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RIPAS2E1OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiRipaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RIPAS2LE1, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiRipaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + false, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RIPAS2LE1IS, 
[](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiRipaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RIPAS2LE1OS, [](ThreadContext *tc, RegVal value) + { + TlbiOp64::tlbiRipaS2(tc, value, + isSecureAtEL(tc, EL1), // secure + TranslationRegime::EL10, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVAE2, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + tlbiRva(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + false, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVAE2IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + tlbiRva(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVAE2OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + tlbiRva(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVALE2, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + tlbiRva(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + false, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVALE2IS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? 
+ TranslationRegime::EL20 : TranslationRegime::EL2; + + tlbiRva(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVALE2OS, [](ThreadContext *tc, RegVal value) + { + const TranslationRegime regime = ELIsInHost(tc, EL2) ? + TranslationRegime::EL20 : TranslationRegime::EL2; + + tlbiRva(tc, value, + isSecureAtEL(tc, EL2), // secure + regime, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVAE3, [](ThreadContext *tc, RegVal value) + { + tlbiRva(tc, value, + isSecureAtEL(tc, EL3), // secure + TranslationRegime::EL3, // regime + false, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVAE3IS, [](ThreadContext *tc, RegVal value) + { + tlbiRva(tc, value, + isSecureAtEL(tc, EL3), // secure + TranslationRegime::EL3, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVAE3OS, [](ThreadContext *tc, RegVal value) + { + tlbiRva(tc, value, + isSecureAtEL(tc, EL3), // secure + TranslationRegime::EL3, // regime + true, // shareable + false); // last level only + } + }, + + { MISCREG_TLBI_RVALE3, [](ThreadContext *tc, RegVal value) + { + tlbiRva(tc, value, + isSecureAtEL(tc, EL3), // secure + TranslationRegime::EL3, // regime + false, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVALE3IS, [](ThreadContext *tc, RegVal value) + { + tlbiRva(tc, value, + isSecureAtEL(tc, EL3), // secure + TranslationRegime::EL3, // regime + true, // shareable + true); // last level only + } + }, + + { MISCREG_TLBI_RVALE3OS, [](ThreadContext *tc, RegVal value) + { + tlbiRva(tc, value, + isSecureAtEL(tc, EL3), // secure + TranslationRegime::EL3, // regime + true, // shareable + true); // last level only + } + }, +}; + void TlbiOp64::performTlbi(ExecContext *xc, MiscRegIndex dest_idx, RegVal value) const { ThreadContext* tc = xc->tcBase(); - auto isa = static_cast(tc->getIsaPtr()); - auto 
release = isa->getRelease(); - bool asid_16bits = ArmSystem::haveLargeAsid64(tc); - - switch (dest_idx) { - // AArch64 TLB Invalidate All, EL3 - case MISCREG_TLBI_ALLE3: - { - TLBIALLEL tlbiOp(EL3, true); - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate All, EL3, Inner Shareable - case MISCREG_TLBI_ALLE3IS: - // AArch64 TLB Invalidate All, EL3, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_ALLE3OS: - { - TLBIALLEL tlbiOp(EL3, true); - tlbiOp.broadcast(tc); - return; - } - // AArch64 TLB Invalidate All, EL2 - case MISCREG_TLBI_ALLE2: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIALLEL tlbiOp(EL2, secure); - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate All, EL2, Inner Shareable - case MISCREG_TLBI_ALLE2IS: - // AArch64 TLB Invalidate All, EL2, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_ALLE2OS: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIALLEL tlbiOp(EL2, secure); - tlbiOp.broadcast(tc); - return; - } - // AArch64 TLB Invalidate All, EL1 - case MISCREG_TLBI_ALLE1: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIALLEL tlbiOp(EL1, secure); - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate All, EL1, Inner Shareable - case MISCREG_TLBI_ALLE1IS: - // AArch64 TLB Invalidate All, EL1, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. 
- // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_ALLE1OS: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIALLEL tlbiOp(EL1, secure); - tlbiOp.broadcast(tc); - return; - } - case MISCREG_TLBI_VMALLS12E1: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIVMALL tlbiOp(EL1, secure, true); - tlbiOp(tc); - return; - } - case MISCREG_TLBI_VMALLE1: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIVMALL tlbiOp(target_el, secure, false); - tlbiOp(tc); - return; - } - case MISCREG_TLBI_VMALLS12E1IS: - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_VMALLS12E1OS: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIVMALL tlbiOp(EL1, secure, true); - tlbiOp.broadcast(tc); - return; - } - case MISCREG_TLBI_VMALLE1IS: - // We are currently not distinguishing Inner and Outer domains. 
- // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_VMALLE1OS: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIVMALL tlbiOp(target_el, secure, false); - tlbiOp.broadcast(tc); - return; - } - // AArch64 TLB Invalidate by VA, EL3 - case MISCREG_TLBI_VAE3_Xt: - { - - TLBIMVAA tlbiOp(EL3, true, - static_cast(bits(value, 43, 0)) << 12, - false); - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate by VA, Last Level, EL3 - case MISCREG_TLBI_VALE3_Xt: - { - - TLBIMVAA tlbiOp(EL3, true, - static_cast(bits(value, 43, 0)) << 12, - true); - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate by VA, EL3, Inner Shareable - case MISCREG_TLBI_VAE3IS_Xt: - // AArch64 TLB Invalidate by VA, EL3, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_VAE3OS_Xt: - { - TLBIMVAA tlbiOp(EL3, true, - static_cast(bits(value, 43, 0)) << 12, - false); - - tlbiOp.broadcast(tc); - return; - } - // AArch64 TLB Invalidate by VA, Last Level, EL3, Inner Shareable - case MISCREG_TLBI_VALE3IS_Xt: - // AArch64 TLB Invalidate by VA, Last Level, EL3, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. 
- // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_VALE3OS_Xt: - { - TLBIMVAA tlbiOp(EL3, true, - static_cast(bits(value, 43, 0)) << 12, - true); - - tlbiOp.broadcast(tc); - return; - } - // AArch64 TLB Invalidate by VA, EL2 - case MISCREG_TLBI_VAE2_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - - if (hcr.e2h) { - // The asid will only be used when e2h == 1 - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - TLBIMVA tlbiOp(EL2, secure, - static_cast(bits(value, 43, 0)) << 12, - asid, false); - tlbiOp(tc); - } else { - TLBIMVAA tlbiOp(EL2, secure, - static_cast(bits(value, 43, 0)) << 12, - false); - tlbiOp(tc); - } - return; - } - // AArch64 TLB Invalidate by VA, Last Level, EL2 - case MISCREG_TLBI_VALE2_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - - if (hcr.e2h) { - // The asid will only be used when e2h == 1 - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - TLBIMVA tlbiOp(EL2, secure, - static_cast(bits(value, 43, 0)) << 12, - asid, true); - tlbiOp(tc); - } else { - TLBIMVAA tlbiOp(EL2, secure, - static_cast(bits(value, 43, 0)) << 12, - true); - tlbiOp(tc); - } - return; - } - // AArch64 TLB Invalidate by VA, EL2, Inner Shareable - case MISCREG_TLBI_VAE2IS_Xt: - // AArch64 TLB Invalidate by VA, EL2, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_VAE2OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - - if (hcr.e2h) { - // The asid will only be used when e2h == 1 - auto asid = asid_16bits ? 
bits(value, 63, 48) : - bits(value, 55, 48); - - TLBIMVA tlbiOp(EL2, secure, - static_cast(bits(value, 43, 0)) << 12, - asid, false); - tlbiOp.broadcast(tc); - } else { - TLBIMVAA tlbiOp(EL2, secure, - static_cast(bits(value, 43, 0)) << 12, - false); - tlbiOp.broadcast(tc); - } - return; - } - // AArch64 TLB Invalidate by VA, Last Level, EL2, Inner Shareable - case MISCREG_TLBI_VALE2IS_Xt: - // AArch64 TLB Invalidate by VA, Last Level, EL2, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_VALE2OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - - if (hcr.e2h) { - // The asid will only be used when e2h == 1 - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - TLBIMVA tlbiOp(EL2, secure, - static_cast(bits(value, 43, 0)) << 12, - asid, true); - tlbiOp.broadcast(tc); - } else { - TLBIMVAA tlbiOp(EL2, secure, - static_cast(bits(value, 43, 0)) << 12, - true); - tlbiOp.broadcast(tc); - } - return; - } - // AArch64 TLB Invalidate by VA, EL1 - case MISCREG_TLBI_VAE1_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVA tlbiOp(target_el, secure, - static_cast(bits(value, 43, 0)) << 12, - asid, false); - - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate by VA, Last Level, EL1 - case MISCREG_TLBI_VALE1_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? 
bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVA tlbiOp(target_el, secure, - static_cast(bits(value, 43, 0)) << 12, - asid, true); - - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate by VA, EL1, Inner Shareable - case MISCREG_TLBI_VAE1IS_Xt: - // AArch64 TLB Invalidate by VA, EL1, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_VAE1OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVA tlbiOp(target_el, secure, - static_cast(bits(value, 43, 0)) << 12, - asid, false); - - tlbiOp.broadcast(tc); - return; - } - case MISCREG_TLBI_VALE1IS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVA tlbiOp(target_el, secure, - static_cast(bits(value, 43, 0)) << 12, - asid, true); - - tlbiOp.broadcast(tc); - return; - } - // AArch64 TLB Invalidate by ASID, EL1 - case MISCREG_TLBI_ASIDE1_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? 
bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIASID tlbiOp(target_el, secure, asid); - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate by ASID, EL1, Inner Shareable - case MISCREG_TLBI_ASIDE1IS_Xt: - // AArch64 TLB Invalidate by ASID, EL1, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_ASIDE1OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIASID tlbiOp(target_el, secure, asid); - tlbiOp.broadcast(tc); - return; - } - // AArch64 TLB Invalidate by VA, All ASID, EL1 - case MISCREG_TLBI_VAAE1_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(target_el, secure, - static_cast(bits(value, 43, 0)) << 12, - false); - - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate by VA, Last Level, All ASID, EL1 - case MISCREG_TLBI_VAALE1_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(target_el, secure, - static_cast(bits(value, 43, 0)) 
<< 12, - true); - - tlbiOp(tc); - return; - } - // AArch64 TLB Invalidate by VA, All ASID, EL1, Inner Shareable - case MISCREG_TLBI_VAAE1IS_Xt: - // AArch64 TLB Invalidate by VA, All ASID, EL1, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_VAAE1OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(target_el, secure, - static_cast(bits(value, 43, 0)) << 12, - false); - - tlbiOp.broadcast(tc); - return; - } - // AArch64 TLB Invalidate by VA, All ASID, - // Last Level, EL1, Inner Shareable - case MISCREG_TLBI_VAALE1IS_Xt: - // AArch64 TLB Invalidate by VA, All ASID, - // Last Level, EL1, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_VAALE1OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIMVAA tlbiOp(target_el, secure, - static_cast(bits(value, 43, 0)) << 12, - true); - - tlbiOp.broadcast(tc); - return; - } - // AArch64 TLB Invalidate by Intermediate Physical Address, - // Stage 2, EL1 - case MISCREG_TLBI_IPAS2E1_Xt: - { - if (EL2Enabled(tc)) { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && - !scr.ns && !bits(value, 63); - - const int top_bit = ArmSystem::physAddrRange(tc) == 52 ? 
- 39 : 35; - TLBIIPA tlbiOp(EL1, secure, - static_cast(bits(value, top_bit, 0)) << 12, - false); - - tlbiOp(tc); - } - return; - } - // AArch64 TLB Invalidate by Intermediate Physical Address, - // Stage 2, Last Level EL1 - case MISCREG_TLBI_IPAS2LE1_Xt: - { - if (EL2Enabled(tc)) { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && - !scr.ns && !bits(value, 63); - - TLBIIPA tlbiOp(EL1, secure, - static_cast(bits(value, 35, 0)) << 12, - true); - - tlbiOp(tc); - } - return; - } - // AArch64 TLB Invalidate by Intermediate Physical Address, - // Stage 2, EL1, Inner Shareable - case MISCREG_TLBI_IPAS2E1IS_Xt: - // AArch64 TLB Invalidate by Intermediate Physical Address, - // Stage 2, EL1, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. - // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_IPAS2E1OS_Xt: - { - if (EL2Enabled(tc)) { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && - !scr.ns && !bits(value, 63); - - const int top_bit = ArmSystem::physAddrRange(tc) == 52 ? - 39 : 35; - TLBIIPA tlbiOp(EL1, secure, - static_cast(bits(value, top_bit, 0)) << 12, - false); - - tlbiOp.broadcast(tc); - } - return; - } - // AArch64 TLB Invalidate by Intermediate Physical Address, - // Stage 2, Last Level, EL1, Inner Shareable - case MISCREG_TLBI_IPAS2LE1IS_Xt: - // AArch64 TLB Invalidate by Intermediate Physical Address, - // Stage 2, Last Level, EL1, Outer Shareable - // We are currently not distinguishing Inner and Outer domains. 
- // We therefore implement TLBIOS instructions as TLBIIS - case MISCREG_TLBI_IPAS2LE1OS_Xt: - { - if (EL2Enabled(tc)) { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && - !scr.ns && !bits(value, 63); - - TLBIIPA tlbiOp(EL1, secure, - static_cast(bits(value, 35, 0)) << 12, - true); - - tlbiOp.broadcast(tc); - } - return; - } - case MISCREG_TLBI_RVAE1_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIRMVA tlbiOp(target_el, secure, value, asid, false); - - if (tlbiOp.valid()) - tlbiOp(tc); - return; - } - case MISCREG_TLBI_RVAE1IS_Xt: - case MISCREG_TLBI_RVAE1OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? 
bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIRMVA tlbiOp(target_el, secure, value, asid, false); - - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - return; - } - case MISCREG_TLBI_RVAAE1_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIRMVAA tlbiOp(target_el, secure, value, false); - - if (tlbiOp.valid()) - tlbiOp(tc); - return; - } - case MISCREG_TLBI_RVAAE1IS_Xt: - case MISCREG_TLBI_RVAAE1OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIRMVAA tlbiOp(target_el, secure, value, false); - - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - return; - } - case MISCREG_TLBI_RVALE1_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIRMVA tlbiOp(target_el, secure, value, asid, true); - - if (tlbiOp.valid()) - tlbiOp(tc); - return; - } - case MISCREG_TLBI_RVALE1IS_Xt: - case MISCREG_TLBI_RVALE1OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - auto asid = asid_16bits ? 
bits(value, 63, 48) : - bits(value, 55, 48); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIRMVA tlbiOp(target_el, secure, value, asid, true); - - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - return; - } - case MISCREG_TLBI_RVAALE1_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIRMVAA tlbiOp(target_el, secure, value, true); - - if (tlbiOp.valid()) - tlbiOp(tc); - return; - } - case MISCREG_TLBI_RVAALE1IS_Xt: - case MISCREG_TLBI_RVAALE1OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - ExceptionLevel target_el = EL1; - if (EL2Enabled(tc)) { - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - if (hcr.tge && hcr.e2h) { - target_el = EL2; - } - } - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - TLBIRMVAA tlbiOp(target_el, secure, value, true); - - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - return; - } - case MISCREG_TLBI_RIPAS2E1_Xt: - { - if (EL2Enabled(tc)) { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && - !scr.ns && !bits(value, 63); - - TLBIRIPA tlbiOp(EL1, secure, value, false); - - tlbiOp(tc); - } - return; - } - case MISCREG_TLBI_RIPAS2E1IS_Xt: - { - if (EL2Enabled(tc)) { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && - !scr.ns && !bits(value, 63); - - TLBIRIPA tlbiOp(EL1, secure, value, false); - - tlbiOp.broadcast(tc); - } - return; - } - case MISCREG_TLBI_RIPAS2LE1_Xt: - { - if (EL2Enabled(tc)) { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && 
- !scr.ns && !bits(value, 63); - - TLBIRIPA tlbiOp(EL1, secure, value, true); - - tlbiOp(tc); - } - return; - } - case MISCREG_TLBI_RIPAS2LE1IS_Xt: - { - if (EL2Enabled(tc)) { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - - bool secure = release->has(ArmExtension::SECURITY) && - !scr.ns && !bits(value, 63); - - TLBIRIPA tlbiOp(EL1, secure, value, true); - - tlbiOp.broadcast(tc); - } - return; - } - case MISCREG_TLBI_RVAE2_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - - if (hcr.e2h) { - // The asid will only be used when e2h == 1 - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - TLBIRMVA tlbiOp(EL2, secure, value, asid, false); - - if (tlbiOp.valid()) - tlbiOp(tc); - } else { - TLBIRMVAA tlbiOp(EL2, secure, value, false); - - if (tlbiOp.valid()) - tlbiOp(tc); - } - return; - } - case MISCREG_TLBI_RVAE2IS_Xt: - case MISCREG_TLBI_RVAE2OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - - if (hcr.e2h) { - // The asid will only be used when e2h == 1 - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - TLBIRMVA tlbiOp(EL2, secure, value, asid, false); - - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - } else { - TLBIRMVAA tlbiOp(EL2, secure, value, false); - - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - } - return; - } - case MISCREG_TLBI_RVALE2_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - - if (hcr.e2h) { - // The asid will only be used when e2h == 1 - auto asid = asid_16bits ? 
bits(value, 63, 48) : - bits(value, 55, 48); - - TLBIRMVA tlbiOp(EL2, secure, value, asid, true); - - if (tlbiOp.valid()) - tlbiOp(tc); - } else { - TLBIRMVAA tlbiOp(EL2, secure, value, true); - - if (tlbiOp.valid()) - tlbiOp(tc); - } - return; - } - case MISCREG_TLBI_RVALE2IS_Xt: - case MISCREG_TLBI_RVALE2OS_Xt: - { - SCR scr = tc->readMiscReg(MISCREG_SCR_EL3); - HCR hcr = tc->readMiscReg(MISCREG_HCR_EL2); - - bool secure = release->has(ArmExtension::SECURITY) && !scr.ns; - - if (hcr.e2h) { - // The asid will only be used when e2h == 1 - auto asid = asid_16bits ? bits(value, 63, 48) : - bits(value, 55, 48); - - TLBIRMVA tlbiOp(EL2, secure, value, asid, true); - - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - } else { - TLBIRMVAA tlbiOp(EL2, secure, value, true); - - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - } - return; - } - case MISCREG_TLBI_RVAE3_Xt: - { - TLBIRMVAA tlbiOp(EL3, true, value, false); - if (tlbiOp.valid()) - tlbiOp(tc); - return; - } - case MISCREG_TLBI_RVAE3IS_Xt: - case MISCREG_TLBI_RVAE3OS_Xt: - { - TLBIRMVAA tlbiOp(EL3, true, value, false); - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - return; - } - case MISCREG_TLBI_RVALE3_Xt: - { - TLBIRMVAA tlbiOp(EL3, true, value, true); - if (tlbiOp.valid()) - tlbiOp(tc); - return; - } - case MISCREG_TLBI_RVALE3IS_Xt: - case MISCREG_TLBI_RVALE3OS_Xt: - { - TLBIRMVAA tlbiOp(EL3, true, value, true); - if (tlbiOp.valid()) - tlbiOp.broadcast(tc); - return; - } - default: + if (auto it = tlbiOps.find(dest_idx); it != tlbiOps.end()) { + it->second(tc, value); + } else { panic("Invalid TLBI\n"); } } diff --git a/src/arch/arm/insts/misc64.hh b/src/arch/arm/insts/misc64.hh index 3a67210b92..14ed41cb75 100644 --- a/src/arch/arm/insts/misc64.hh +++ b/src/arch/arm/insts/misc64.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2013,2017-2019, 2021-2022 Arm Limited + * Copyright (c) 2011-2013,2017-2019, 2021-2022, 2024 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and 
shall @@ -39,6 +39,7 @@ #define __ARCH_ARM_INSTS_MISC64_HH__ #include "arch/arm/insts/static_inst.hh" +#include "arch/arm/types.hh" namespace gem5 { @@ -283,6 +284,45 @@ class RegNone : public ArmISA::ArmStaticInst class TlbiOp64 : public MiscRegRegImmOp64 { + protected: + using TlbiFunc = std::function; + + static std::unordered_map tlbiOps; + + static void tlbiAll(ThreadContext *tc, RegVal value, + bool secure, ArmISA::TranslationRegime regime, bool shareable); + + static void tlbiVmall(ThreadContext *tc, RegVal value, + bool secure, ArmISA::TranslationRegime regime, bool shareable, + bool stage2=false); + + static void tlbiVa(ThreadContext *tc, RegVal value, + bool secure, ArmISA::TranslationRegime regime, bool shareable, + bool last_level); + + static void tlbiVaa(ThreadContext *tc, RegVal value, + bool secure, ArmISA::TranslationRegime regime, bool shareable, + bool last_level); + + static void tlbiAsid(ThreadContext *tc, RegVal value, + bool secure, ArmISA::TranslationRegime regime, bool shareable); + + static void tlbiIpaS2(ThreadContext *tc, RegVal value, + bool secure, ArmISA::TranslationRegime regime, bool shareable, + bool last_level); + + static void tlbiRvaa(ThreadContext *tc, RegVal value, + bool secure, ArmISA::TranslationRegime regime, bool shareable, + bool last_level); + + static void tlbiRva(ThreadContext *tc, RegVal value, + bool secure, ArmISA::TranslationRegime regime, bool shareable, + bool last_level); + + static void tlbiRipaS2(ThreadContext *tc, RegVal value, + bool secure, ArmISA::TranslationRegime regime, bool shareable, + bool last_level); + protected: TlbiOp64(const char *mnem, ArmISA::ExtMachInst _machInst, OpClass __opClass, ArmISA::MiscRegIndex _dest, diff --git a/src/arch/arm/insts/pseudo.cc b/src/arch/arm/insts/pseudo.cc index 3d017c1857..0402071255 100644 --- a/src/arch/arm/insts/pseudo.cc +++ b/src/arch/arm/insts/pseudo.cc @@ -116,6 +116,7 @@ FailUnimplemented::FailUnimplemented(const char *_mnemonic, // don't call execute() 
(which panics) if we're on a // speculative path flags[IsNonSpeculative] = true; + flags[IsInvalid] = true; } FailUnimplemented::FailUnimplemented(const char *_mnemonic, @@ -127,6 +128,7 @@ FailUnimplemented::FailUnimplemented(const char *_mnemonic, // don't call execute() (which panics) if we're on a // speculative path flags[IsNonSpeculative] = true; + flags[IsInvalid] = true; } Fault diff --git a/src/arch/arm/isa.cc b/src/arch/arm/isa.cc index f961a2d2c4..69ca95d306 100644 --- a/src/arch/arm/isa.cc +++ b/src/arch/arm/isa.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2023 Arm Limited + * Copyright (c) 2010-2024 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -44,6 +44,7 @@ #include "arch/arm/mmu.hh" #include "arch/arm/pmu.hh" #include "arch/arm/regs/misc.hh" +#include "arch/arm/regs/misc_accessors.hh" #include "arch/arm/self_debug.hh" #include "arch/arm/system.hh" #include "arch/arm/utility.hh" @@ -72,6 +73,8 @@ namespace gem5 namespace ArmISA { +using namespace misc_regs; + namespace { @@ -80,7 +83,7 @@ RegClass floatRegClass(FloatRegClass, FloatRegClassName, 0, debug::FloatRegs); } // anonymous namespace -ISA::ISA(const Params &p) : BaseISA(p), system(NULL), +ISA::ISA(const Params &p) : BaseISA(p, "arm"), system(NULL), _decoderFlavor(p.decoderFlavor), pmu(p.pmu), impdefAsNop(p.impdef_nop) { _regClasses.push_back(&flatIntRegClass); @@ -106,6 +109,7 @@ ISA::ISA(const Params &p) : BaseISA(p), system(NULL), // Cache system-level properties if (FullSystem && system) { highestELIs64 = system->highestELIs64(); + highestEL = system->highestEL(); haveLargeAsid64 = system->haveLargeAsid64(); physAddrRange = system->physAddrRange(); sveVL = system->sveVL(); @@ -114,6 +118,7 @@ ISA::ISA(const Params &p) : BaseISA(p), system(NULL), release = system->releaseFS(); } else { highestELIs64 = true; // ArmSystem::highestELIs64 does the same + highestEL = EL1; // ArmSystem::highestEL does the same haveLargeAsid64 = 
false; physAddrRange = 32; // dummy value sveVL = p.sve_vl_se; @@ -267,6 +272,8 @@ ISA::redirectRegVHE(int misc_reg) return currEL() == EL2 ? MISCREG_CONTEXTIDR_EL2 : misc_reg; case MISCREG_CNTKCTL_EL1: return currEL() == EL2 ? MISCREG_CNTHCTL_EL2 : misc_reg; + case MISCREG_MPAM1_EL1: + return currEL() == EL2 ? MISCREG_MPAM2_EL2 : misc_reg; case MISCREG_CNTP_TVAL: case MISCREG_CNTP_TVAL_EL0: if (ELIsInHost(tc, currEL())) { @@ -356,6 +363,8 @@ ISA::redirectRegVHE(int misc_reg) return MISCREG_CONTEXTIDR_EL1; case MISCREG_CNTKCTL_EL12: return MISCREG_CNTKCTL_EL1; + case MISCREG_MPAM1_EL12: + return MISCREG_MPAM1_EL1; // _EL02 registers case MISCREG_CNTP_TVAL_EL02: return MISCREG_CNTP_TVAL_EL0; @@ -600,6 +609,23 @@ ISA::readMiscReg(RegIndex idx) case MISCREG_HIFAR: // alias for secure IFAR return readMiscRegNoEffect(MISCREG_IFAR_S); + case MISCREG_MPAM1_EL1: + { + MPAM mpam1 = readMiscRegNoEffect(MISCREG_MPAM1_EL1); + mpam1.mpamEn = readRegisterNoEffect( + tc, highestEL).mpamEn; + mpam1.el1.forcedNs = isSecure(tc) ? 
+ readRegisterNoEffect(tc, EL3).el3.forceNs : 0; + return mpam1; + } + case MISCREG_MPAM2_EL2: + { + MPAM mpam2 = readMiscRegNoEffect(MISCREG_MPAM2_EL2); + mpam2.mpamEn = readRegisterNoEffect( + tc, highestEL).mpamEn; + return mpam2; + } + case MISCREG_RNDR: tc->setReg(cc_reg::Nz, (RegVal)0); tc->setReg(cc_reg::C, (RegVal)0); @@ -731,8 +757,8 @@ ISA::setMiscReg(RegIndex idx, RegVal val) const uint32_t ones = (uint32_t)(-1); CPACR cpacrMask = 0; - // Only cp10, cp11, and ase are implemented, nothing else should - // be writable + // Only cp10, cp11, and ase are implemented + // nothing else should be writable cpacrMask.cp10 = ones; cpacrMask.cp11 = ones; cpacrMask.asedis = ones; @@ -1541,6 +1567,8 @@ ISA::getCurSmeVecLenInBits() const void ISA::serialize(CheckpointOut &cp) const { + BaseISA::serialize(cp); + DPRINTF(Checkpoint, "Serializing Arm Misc Registers\n"); SERIALIZE_MAPPING(miscRegs, miscRegName, NUM_PHYS_MISCREGS); } diff --git a/src/arch/arm/isa.hh b/src/arch/arm/isa.hh index 8ed37ba861..a60c887391 100644 --- a/src/arch/arm/isa.hh +++ b/src/arch/arm/isa.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, 2012-2023 Arm Limited + * Copyright (c) 2010, 2012-2024 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -90,6 +90,7 @@ namespace ArmISA // Cached copies of system-level properties bool highestELIs64; + ExceptionLevel highestEL; bool haveLargeAsid64; uint8_t physAddrRange; @@ -434,6 +435,8 @@ namespace ArmISA void globalClearExclusive() override; void globalClearExclusive(ExecContext *xc) override; + + int64_t getVectorLengthInBytes() const override { return sveVL * 16; } }; } // namespace ArmISA diff --git a/src/arch/arm/isa/formats/aarch64.isa b/src/arch/arm/isa/formats/aarch64.isa index 30f9009121..246d1a7836 100644 --- a/src/arch/arm/isa/formats/aarch64.isa +++ b/src/arch/arm/isa/formats/aarch64.isa @@ -1,4 +1,4 @@ -// Copyright (c) 2011-2023 Arm Limited +// Copyright (c) 2011-2024 Arm Limited // 
All rights reserved // // The license below extends only to copyright in the software and shall @@ -531,29 +531,29 @@ namespace Aarch64 case MISCREG_TLBI_ALLE1: case MISCREG_TLBI_VMALLS12E1: case MISCREG_TLBI_VMALLE1: - case MISCREG_TLBI_VAE3_Xt: - case MISCREG_TLBI_VALE3_Xt: - case MISCREG_TLBI_VAE2_Xt: - case MISCREG_TLBI_VALE2_Xt: - case MISCREG_TLBI_VAE1_Xt: - case MISCREG_TLBI_VALE1_Xt: - case MISCREG_TLBI_ASIDE1_Xt: - case MISCREG_TLBI_VAAE1_Xt: - case MISCREG_TLBI_VAALE1_Xt: - case MISCREG_TLBI_IPAS2E1_Xt: - case MISCREG_TLBI_IPAS2LE1_Xt: - case MISCREG_TLBI_RVAE1_Xt: - case MISCREG_TLBI_RVAAE1_Xt: - case MISCREG_TLBI_RVALE1_Xt: - case MISCREG_TLBI_RVAALE1_Xt: - case MISCREG_TLBI_RIPAS2E1_Xt: - case MISCREG_TLBI_RIPAS2LE1_Xt: - case MISCREG_TLBI_RVAE2_Xt: - case MISCREG_TLBI_RVALE2_Xt: - case MISCREG_TLBI_RVAE3_Xt: - case MISCREG_TLBI_RVALE3_Xt: + case MISCREG_TLBI_VAE3: + case MISCREG_TLBI_VALE3: + case MISCREG_TLBI_VAE2: + case MISCREG_TLBI_VALE2: + case MISCREG_TLBI_VAE1: + case MISCREG_TLBI_VALE1: + case MISCREG_TLBI_ASIDE1: + case MISCREG_TLBI_VAAE1: + case MISCREG_TLBI_VAALE1: + case MISCREG_TLBI_IPAS2E1: + case MISCREG_TLBI_IPAS2LE1: + case MISCREG_TLBI_RVAE1: + case MISCREG_TLBI_RVAAE1: + case MISCREG_TLBI_RVALE1: + case MISCREG_TLBI_RVAALE1: + case MISCREG_TLBI_RIPAS2E1: + case MISCREG_TLBI_RIPAS2LE1: + case MISCREG_TLBI_RVAE2: + case MISCREG_TLBI_RVALE2: + case MISCREG_TLBI_RVAE3: + case MISCREG_TLBI_RVALE3: return new Tlbi64LocalHub( - machInst, miscReg, rt); + machInst, miscReg, rt, dec.dvmEnabled); case MISCREG_TLBI_ALLE3IS: case MISCREG_TLBI_ALLE3OS: case MISCREG_TLBI_ALLE2IS: @@ -564,48 +564,48 @@ namespace Aarch64 case MISCREG_TLBI_VMALLS12E1OS: case MISCREG_TLBI_VMALLE1IS: case MISCREG_TLBI_VMALLE1OS: - case MISCREG_TLBI_VAE3IS_Xt: - case MISCREG_TLBI_VAE3OS_Xt: - case MISCREG_TLBI_VALE3IS_Xt: - case MISCREG_TLBI_VALE3OS_Xt: - case MISCREG_TLBI_VAE2IS_Xt: - case MISCREG_TLBI_VAE2OS_Xt: - case MISCREG_TLBI_VALE2IS_Xt: - case 
MISCREG_TLBI_VALE2OS_Xt: - case MISCREG_TLBI_VAE1IS_Xt: - case MISCREG_TLBI_VAE1OS_Xt: - case MISCREG_TLBI_VALE1IS_Xt: - case MISCREG_TLBI_VALE1OS_Xt: - case MISCREG_TLBI_ASIDE1IS_Xt: - case MISCREG_TLBI_ASIDE1OS_Xt: - case MISCREG_TLBI_VAAE1IS_Xt: - case MISCREG_TLBI_VAAE1OS_Xt: - case MISCREG_TLBI_VAALE1IS_Xt: - case MISCREG_TLBI_VAALE1OS_Xt: - case MISCREG_TLBI_IPAS2E1IS_Xt: - case MISCREG_TLBI_IPAS2E1OS_Xt: - case MISCREG_TLBI_IPAS2LE1IS_Xt: - case MISCREG_TLBI_IPAS2LE1OS_Xt: - case MISCREG_TLBI_RVAE1IS_Xt: - case MISCREG_TLBI_RVAE1OS_Xt: - case MISCREG_TLBI_RVAAE1IS_Xt: - case MISCREG_TLBI_RVAAE1OS_Xt: - case MISCREG_TLBI_RVALE1IS_Xt: - case MISCREG_TLBI_RVALE1OS_Xt: - case MISCREG_TLBI_RVAALE1IS_Xt: - case MISCREG_TLBI_RVAALE1OS_Xt: - case MISCREG_TLBI_RIPAS2E1IS_Xt: - case MISCREG_TLBI_RIPAS2E1OS_Xt: - case MISCREG_TLBI_RIPAS2LE1IS_Xt: - case MISCREG_TLBI_RIPAS2LE1OS_Xt: - case MISCREG_TLBI_RVAE2IS_Xt: - case MISCREG_TLBI_RVAE2OS_Xt: - case MISCREG_TLBI_RVALE2IS_Xt: - case MISCREG_TLBI_RVALE2OS_Xt: - case MISCREG_TLBI_RVAE3IS_Xt: - case MISCREG_TLBI_RVAE3OS_Xt: - case MISCREG_TLBI_RVALE3IS_Xt: - case MISCREG_TLBI_RVALE3OS_Xt: + case MISCREG_TLBI_VAE3IS: + case MISCREG_TLBI_VAE3OS: + case MISCREG_TLBI_VALE3IS: + case MISCREG_TLBI_VALE3OS: + case MISCREG_TLBI_VAE2IS: + case MISCREG_TLBI_VAE2OS: + case MISCREG_TLBI_VALE2IS: + case MISCREG_TLBI_VALE2OS: + case MISCREG_TLBI_VAE1IS: + case MISCREG_TLBI_VAE1OS: + case MISCREG_TLBI_VALE1IS: + case MISCREG_TLBI_VALE1OS: + case MISCREG_TLBI_ASIDE1IS: + case MISCREG_TLBI_ASIDE1OS: + case MISCREG_TLBI_VAAE1IS: + case MISCREG_TLBI_VAAE1OS: + case MISCREG_TLBI_VAALE1IS: + case MISCREG_TLBI_VAALE1OS: + case MISCREG_TLBI_IPAS2E1IS: + case MISCREG_TLBI_IPAS2E1OS: + case MISCREG_TLBI_IPAS2LE1IS: + case MISCREG_TLBI_IPAS2LE1OS: + case MISCREG_TLBI_RVAE1IS: + case MISCREG_TLBI_RVAE1OS: + case MISCREG_TLBI_RVAAE1IS: + case MISCREG_TLBI_RVAAE1OS: + case MISCREG_TLBI_RVALE1IS: + case MISCREG_TLBI_RVALE1OS: + case 
MISCREG_TLBI_RVAALE1IS: + case MISCREG_TLBI_RVAALE1OS: + case MISCREG_TLBI_RIPAS2E1IS: + case MISCREG_TLBI_RIPAS2E1OS: + case MISCREG_TLBI_RIPAS2LE1IS: + case MISCREG_TLBI_RIPAS2LE1OS: + case MISCREG_TLBI_RVAE2IS: + case MISCREG_TLBI_RVAE2OS: + case MISCREG_TLBI_RVALE2IS: + case MISCREG_TLBI_RVALE2OS: + case MISCREG_TLBI_RVAE3IS: + case MISCREG_TLBI_RVAE3OS: + case MISCREG_TLBI_RVALE3IS: + case MISCREG_TLBI_RVALE3OS: return new Tlbi64ShareableHub( machInst, miscReg, rt, dec.dvmEnabled); default: diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa index 86c174d7c4..ae8465cecc 100644 --- a/src/arch/arm/isa/formats/sve_2nd_level.isa +++ b/src/arch/arm/isa/formats/sve_2nd_level.isa @@ -245,6 +245,65 @@ namespace Aarch64 return new Unknown64(machInst); } // decodeSveIntMulAdd + StaticInstPtr + decodeSveMultiplyAccIndexed(ExtMachInst machInst) + { + RegIndex zda = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + uint8_t size = bits(machInst, 23, 22); + uint8_t opc = (bits(machInst, 10)); + + switch(size) { + case 0b00: + case 0b01: + { + + RegIndex zm_16 = (RegIndex)(uint8_t)bits(machInst, 18, 16); + uint8_t imm_16 = (uint8_t)(bits(machInst, 22) << 2) + | bits(machInst, 20, 19); + switch(opc) + { + case 0x0: return new Sve2Mlai( + machInst, zda, zn, zm_16, imm_16); + case 0x1: return new Sve2Mlsi( + machInst, zda, zn, zm_16, imm_16); + } + } + break; + + case 0b10: + { + + RegIndex zm_32 = (RegIndex)(uint8_t)bits(machInst, 18, 16); + uint8_t imm_32 = (uint8_t)bits(machInst, 20, 19); + switch(opc) { + case 0x0: return new Sve2Mlai( + machInst, zda, zn, zm_32, imm_32); + case 0x1: return new Sve2Mlsi( + machInst, zda, zn, zm_32, imm_32); + } + } + break; + + case 0b11: + { + + RegIndex zm_64 = (RegIndex)(uint8_t)bits(machInst, 19, 16); + uint8_t imm_64 = (uint8_t)bits(machInst, 20); + switch(opc) { + case 0x0: return new Sve2Mlai( + machInst, zda, zn, zm_64, imm_64); + 
case 0x1: return new Sve2Mlsi( + machInst, zda, zn, zm_64, imm_64); + } + } + break; + } + + return new Unknown64(machInst); + + } // decodeSveMultiplyAccIndexed + StaticInstPtr decodeSveIntMatMulAdd(ExtMachInst machInst) { @@ -509,6 +568,193 @@ namespace Aarch64 return new Unknown64(machInst); } // decodeSveIntArithUnpred + StaticInstPtr + decodeSveIntMulUnpred(ExtMachInst machInst) + { + RegIndex zd = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t opc = bits(machInst, 11, 10); + uint8_t size = bits(machInst, 23, 22); + + switch (opc) { + case 0x1: + if (size == 0x0) { + return new SvePmul(machInst, zd, zn, zm); + } + [[fallthrough]]; + case 0x0: + // MUL (vectors, unpredicated) + case 0x2: + // SMULH (unpredicated) + case 0x3: + // UMULH (unpredicated) + default: + return new Unknown64(machInst); + } + + } // decodeSveIntMulUnpred + + StaticInstPtr + decodeSveIntTerUnpred(ExtMachInst machInst) + { + RegIndex zdn = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zk = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t opc = bits(machInst, 23, 22) << 1 | bits(machInst, 10); + + switch (opc) { + case 0x0: + return new SveEor3(machInst, zdn, zm, zk); + case 0x2: + return new SveBcax(machInst, zdn, zm, zk); + case 0x1: + // BSL + case 0x3: + // BSL1N + case 0x5: + // BSL2N + case 0x7: + // NBSL + default: + return new Unknown64(machInst); + } + } // decodeSveIntTerUnpred + + StaticInstPtr + decodeSve2IntMulLong(ExtMachInst machInst) + { + RegIndex zd = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t opc_u_t = bits(machInst, 12, 10); + uint8_t size = bits(machInst, 23, 22); + + switch (opc_u_t) { + case 0x2: + return decodeSveBinUnpredS2( + size, machInst, 
zd, zn, zm); + case 0x3: + return decodeSveBinUnpredS2( + size, machInst, zd, zn, zm); + case 0x4: + return decodeSveBinUnpredSigned( + size, machInst, zd, zn, zm); + case 0x5: + return decodeSveBinUnpredSigned( + size, machInst, zd, zn, zm); + case 0x6: + return decodeSveBinUnpredUnsigned( + size, machInst, zd, zn, zm); + case 0x7: + return decodeSveBinUnpredUnsigned( + size, machInst, zd, zn, zm); + case 0x0: + // SQDMULLB + case 0x1: + // SQDMULLT + default: + return new Unknown64(machInst); + } + } // decodeSve2IntMulLong + + StaticInstPtr + decodeSve2BitPerm(ExtMachInst machInst) + { + RegIndex zd = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t opc = bits(machInst, 11, 10); + uint8_t size = bits(machInst, 23, 22); + + switch (opc) { + case 0x2: + return decodeSveBinUnpredU( + size, machInst, zd, zn, zm); + case 0x0: + // BEXT + case 0x1: + // BDEP + default: + return new Unknown64(machInst); + } + } // decodeSve2BitPerm + + StaticInstPtr + decodeSveIntRotImm(ExtMachInst machInst) + { + RegIndex zdn = (RegIndex) (uint8_t) bits(machInst, 4, 0); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 9, 5); + uint8_t imm3 = (RegIndex) (uint8_t) bits(machInst, 18, 16); + + uint8_t tsize = (bits(machInst, 23, 22) << 2) | bits(machInst, 20, 19); + uint8_t esize = 0; + uint8_t size = 0; + + if (tsize == 0x0) { + return new Unknown64(machInst); + } else if (tsize == 0x1) { + esize = 8; + } else if ((tsize & 0x0E) == 0x2) { + esize = 16; + size = 1; + } else if ((tsize & 0x0C) == 0x4) { + esize = 32; + size = 2; + } else if ((tsize & 0x08) == 0x8) { + esize = 64; + size = 3; + } + + unsigned rot_am = 2 * esize - ((tsize << 3) | imm3); + return decodeSveBinImmDestrUnpredU( + size, machInst, zdn, zm, rot_am); + } // decodeSveIntRotImm + + StaticInstPtr + decodeSve2CryptBinConstr(ExtMachInst machInst) + { + RegIndex zd = (RegIndex) (uint8_t) 
bits(machInst, 4, 0); + RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); + RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); + uint8_t size = bits(machInst, 23, 22); + uint8_t opc = bits(machInst, 10); + uint8_t size_opc = (size << 1) | opc; + + switch (size_opc) { + case 0x1: + return new SveRax1(machInst, zd, zn, zm); + case 0x0: + // SM4EKEY + default: + return new Unknown64(machInst); + } + } // decodeSve2CryptBinConstr + + StaticInstPtr + decodeSve2WideIntArith(ExtMachInst machInst) + { + uint8_t op0 = bits(machInst, 14, 13); + switch (op0) { + case 0b11: + return decodeSve2IntMulLong(machInst); + default: + return new Unknown64(machInst); + } + } + + StaticInstPtr + decodeSve2Crypto(ExtMachInst machInst) + { + uint8_t op2 = bits(machInst, 12, 11); + switch (op2) { + case 0b10: + return decodeSve2CryptBinConstr(machInst); + default: + return new Unknown64(machInst); + } + } + StaticInstPtr decodeSveIntLogUnpred(ExtMachInst machInst) { @@ -1041,12 +1287,19 @@ namespace Aarch64 decodeSvePermUnpred(ExtMachInst machInst) { uint8_t b12_10 = bits(machInst, 12, 10); - if (b12_10 == 0x4) { + if ((b12_10 == 0x4) || (bits(machInst, 12, 11) == 0x1)) { unsigned size = (unsigned) bits(machInst, 23, 22); RegIndex zd = (RegIndex) (uint8_t) bits(machInst, 4, 0); RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); - return decodeSveBinUnpredU(size, machInst, zd, zn, zm); + if (b12_10 == 0x4) { // TBL, two sources + return decodeSveBinUnpredU(size, machInst, zd, zn, zm); + } else if (bits(machInst, 10) == 0x1) { // TBX + return decodeSveBinUnpredU(size, machInst, zd, zn, zm); + // } else { // TBL, three sources + // TBL, three sources + } + return new Unknown64(machInst); } else if (bits(machInst, 20, 16) == 0x0 && b12_10 == 0x6) { uint8_t size = bits(machInst, 23, 22); RegIndex rn = makeSP( @@ -1391,7 +1644,6 @@ namespace Aarch64 RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5); RegIndex pg 
= (RegIndex) (uint8_t) bits(machInst, 13, 10); RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16); - uint8_t size = bits(machInst, 23, 22); return decodeSveBinConstrPredU(size, @@ -3877,16 +4129,18 @@ namespace Aarch64 } // decodeSveMemStore StaticInstPtr - decodeSveMisc(ExtMachInst machInst) { + decodeSveMisc(ExtMachInst machInst) + { switch(bits(machInst, 13, 10)) { - case 0b0110: { - return decodeSveIntMatMulAdd(machInst); - break; - } - default: { - return new Unknown64(machInst); - break; - } + case 0b0110: + return decodeSveIntMatMulAdd(machInst); + case 0b1100: + case 0b1101: + case 0b1110: + case 0b1111: + return decodeSve2BitPerm(machInst); + default: + return new Unknown64(machInst); } return new Unknown64(machInst); } // decodeSveMisc @@ -3920,6 +4174,11 @@ namespace Aarch64 return decodeSveIntegerDotProductIndexed(machInst); case 0b11: return decodeSveMixedSignDotProductIndexed(machInst); + + // for mla/s indexed , can be renamed + case 0b01: + return decodeSveMultiplyAccIndexed(machInst); + default: return new Unknown64(machInst); } diff --git a/src/arch/arm/isa/formats/sve_top_level.isa b/src/arch/arm/isa/formats/sve_top_level.isa index cb390eb972..9ae075ba2d 100644 --- a/src/arch/arm/isa/formats/sve_top_level.isa +++ b/src/arch/arm/isa/formats/sve_top_level.isa @@ -45,7 +45,9 @@ namespace Aarch64 StaticInstPtr decodeSveIntArithUnaryPred(ExtMachInst machInst); StaticInstPtr decodeSveIntMulAdd(ExtMachInst machInst); StaticInstPtr decodeSveIntMatMulAdd(ExtMachInst machInst); + StaticInstPtr decodeSveIntMulUnpred(ExtMachInst machInst); StaticInstPtr decodeSveIntArithUnpred(ExtMachInst machInst); + StaticInstPtr decodeSveIntTerUnpred(ExtMachInst machInst); StaticInstPtr decodeSveIntLogUnpred(ExtMachInst machInst); StaticInstPtr decodeSveIndexGen(ExtMachInst machInst); StaticInstPtr decodeSveStackAlloc(ExtMachInst machInst); @@ -71,6 +73,12 @@ namespace Aarch64 StaticInstPtr decodeSveIntWideImmUnpred(ExtMachInst machInst); StaticInstPtr 
decodeSveClamp(ExtMachInst machInst); StaticInstPtr decodeSve2Accum(ExtMachInst machInst); + StaticInstPtr decodeSveIntRotImm(ExtMachInst machInst); + StaticInstPtr decodeSve2CryptBinConstr(ExtMachInst machInst); + StaticInstPtr decodeSve2BitPerm(ExtMachInst machInst); + StaticInstPtr decodeSve2IntMulLong(ExtMachInst machInst); + StaticInstPtr decodeSve2WideIntArith(ExtMachInst machInst); + StaticInstPtr decodeSve2Crypto(ExtMachInst machInst); StaticInstPtr decodeSveIntegerDotProductUnpred(ExtMachInst machInst); StaticInstPtr decodeSveIntegerDotProductIndexed(ExtMachInst machInst); @@ -129,10 +137,14 @@ namespace Aarch64 break; case 0b10: case 0b11: - if (bits(machInst, 21) == 0b0 && op2 == 0b10) { + if (bits(machInst, 21) == 0b0 && bits(op2, 1) == 0b0) { + return decodeSve2WideIntArith(machInst); + } else if (bits(machInst, 21) == 0b0 && op2 == 0b10) { return decodeSveMisc(machInst); } else if (bits(machInst, 21) == 0b0 && op2 == 0b11) { return decodeSve2Accum(machInst); + } else if (bits(machInst, 21) == 0b1 && bits(machInst, 15, 13) == 0b111) { + return decodeSve2Crypto(machInst); } else { return new Unknown64(machInst); } @@ -180,7 +192,15 @@ namespace Aarch64 switch (b_15_14) { case 0x0: if (b_13) { - return decodeSveIntLogUnpred(machInst); + if (bits(machInst, 11)) { + return decodeSveIntTerUnpred(machInst); + } else { + if (bits(machInst, 10)) { + return decodeSveIntRotImm(machInst); + } else { + return decodeSveIntLogUnpred(machInst); + } + } } else { if (!bits(machInst, 30)) { return decodeSveIntArithUnpred(machInst); @@ -189,7 +209,7 @@ namespace Aarch64 break; case 0x1: if (b_13) { - return new Unknown64(machInst); + return decodeSveIntMulUnpred(machInst); } else if (b_12) { return decodeSveStackAlloc(machInst); } else { diff --git a/src/arch/arm/isa/insts/crypto.isa b/src/arch/arm/isa/insts/crypto.isa index b6c3ad3c20..5faa4b90d5 100644 --- a/src/arch/arm/isa/insts/crypto.isa +++ b/src/arch/arm/isa/insts/crypto.isa @@ -1,6 +1,6 @@ // -*- mode:c++ -*- // 
-// Copyright (c) 2018 ARM Limited +// Copyright (c) 2018, 2024 Arm Limited // All rights reserved // // The license below extends only to copyright in the software and shall @@ -38,11 +38,10 @@ let {{ cryptoEnabledCheckCode = ''' - auto crypto_reg = xc->tcBase()->readMiscReg(MISCREG_ID_ISAR5); - if (!(crypto_reg & %(mask)d)) { + if (!HaveExt(xc->tcBase(), %(extension)s)) { return std::make_shared(machInst, true); } - ''' + ''' + simdEnabledCheckCode header_output = "" decoder_output = "" @@ -150,7 +149,9 @@ let {{ sha256_su0Code = "crypto.sha256Su0(output, input);" sha256_su1Code = "crypto.sha256Su1(output, input, input2);" - aes_enabled = cryptoEnabledCheckCode % { "mask" : 0xF0 } + aes_enabled = cryptoEnabledCheckCode % { + "extension" : "ArmExtension::FEAT_AES" + } cryptoRegRegRegInst("aese", "AESE", "SimdAesOp", aes_enabled, aeseCode) cryptoRegRegRegInst("aesd", "AESD", "SimdAesOp", @@ -160,7 +161,9 @@ let {{ cryptoRegRegInst("aesimc", "AESIMC", "SimdAesMixOp", aes_enabled, aesimcCode) - sha1_enabled = cryptoEnabledCheckCode % { "mask" : 0xF00 } + sha1_enabled = cryptoEnabledCheckCode % { + "extension" : "ArmExtension::FEAT_SHA1" + } cryptoRegRegRegInst("sha1c", "SHA1C", "SimdSha1HashOp", sha1_enabled, sha1_cCode) cryptoRegRegRegInst("sha1p", "SHA1P", "SimdSha1HashOp", @@ -174,7 +177,9 @@ let {{ cryptoRegRegInst("sha1su1", "SHA1SU1", "SimdShaSigma2Op", sha1_enabled, sha1_su1Code) - sha2_enabled = cryptoEnabledCheckCode % { "mask" : 0xF000 } + sha2_enabled = cryptoEnabledCheckCode % { + "extension" : "ArmExtension::FEAT_SHA256" + } cryptoRegRegRegInst("sha256h", "SHA256H", "SimdSha256HashOp", sha2_enabled, sha256_hCode) cryptoRegRegRegInst("sha256h2", "SHA256H2", "SimdSha256Hash2Op", diff --git a/src/arch/arm/isa/insts/crypto64.isa b/src/arch/arm/isa/insts/crypto64.isa index 1ae580fa97..0ed0867136 100644 --- a/src/arch/arm/isa/insts/crypto64.isa +++ b/src/arch/arm/isa/insts/crypto64.isa @@ -1,6 +1,6 @@ // -*- mode:c++ -*- // -// Copyright (c) 2018 ARM Limited 
+// Copyright (c) 2018, 2024 Arm Limited // All rights reserved // // The license below extends only to copyright in the software and shall @@ -41,11 +41,11 @@ let {{ exec_output = "" cryptoEnabledCheckCode = ''' - auto crypto_reg = xc->tcBase()->readMiscReg(MISCREG_ID_AA64ISAR0_EL1); - if (!(crypto_reg & %(mask)d)) { + if (!HaveExt(xc->tcBase(), %(extension)s)) { return std::make_shared(machInst, true); } - ''' + ''' + simd64EnabledCheckCode + cryptoRegRegRegPrefix = ''' Crypto crypto; RegVect srcReg1, srcReg2, destReg; @@ -133,7 +133,9 @@ let {{ sha256_su0Code = "crypto.sha256Su0(output, input);" sha256_su1Code = "crypto.sha256Su1(output, input, input2);" - aes_enabled = cryptoEnabledCheckCode % { "mask" : 0xF0 } + aes_enabled = cryptoEnabledCheckCode % { + "extension" : "ArmExtension::FEAT_AES" + } cryptoRegRegRegInst("aese", "AESE64", "SimdAesOp", aes_enabled, aeseCode) cryptoRegRegRegInst("aesd", "AESD64", "SimdAesOp", @@ -143,7 +145,9 @@ let {{ cryptoRegRegInst("aesimc", "AESIMC64", "SimdAesMixOp", aes_enabled, aesimcCode) - sha1_enabled = cryptoEnabledCheckCode % { "mask" : 0xF00 } + sha1_enabled = cryptoEnabledCheckCode % { + "extension" : "ArmExtension::FEAT_SHA1" + } cryptoRegRegRegInst("sha1c", "SHA1C64", "SimdSha1HashOp", sha1_enabled, sha1_cCode) cryptoRegRegRegInst("sha1p", "SHA1P64", "SimdSha1HashOp", @@ -157,7 +161,9 @@ let {{ cryptoRegRegInst("sha1su1", "SHA1SU164", "SimdShaSigma2Op", sha1_enabled, sha1_su1Code) - sha2_enabled = cryptoEnabledCheckCode % { "mask" : 0xF000 } + sha2_enabled = cryptoEnabledCheckCode % { + "extension" : "ArmExtension::FEAT_SHA256" + } cryptoRegRegRegInst("sha256h", "SHA256H64", "SimdSha256HashOp", sha2_enabled, sha256_hCode) cryptoRegRegRegInst("sha256h2", "SHA256H264", "SimdSha256Hash2Op", diff --git a/src/arch/arm/isa/insts/data64.isa b/src/arch/arm/isa/insts/data64.isa index 87f87130ce..02ea53881d 100644 --- a/src/arch/arm/isa/insts/data64.isa +++ b/src/arch/arm/isa/insts/data64.isa @@ -1,6 +1,6 @@ // -*- mode:c++ 
-*- -// Copyright (c) 2011-2013, 2016-2023 Arm Limited +// Copyright (c) 2011-2013, 2016-2024 Arm Limited // All rights reserved // // The license below extends only to copyright in the software and shall @@ -361,15 +361,8 @@ let {{ tlbiCode = msr_check_code + ''' performTlbi(xc, flat_idx, XOp1); ''' - msrTlbiIop = ArmInstObjParams("msr", "Tlbi64LocalHub", "TlbiOp64", - tlbiCode, - ["IsSerializeAfter", "IsNonSpeculative"]) - header_output += MiscRegRegOp64Declare.subst(msrTlbiIop) - decoder_output += MiscRegRegOp64Constructor.subst(msrTlbiIop) - exec_output += BasicExecute.subst(msrTlbiIop) - dvmCode = ''' - if (dvmEnabled) { + if (%(dvmCheck)s) { Request::Flags memAccessFlags = Request::STRICT_ORDER | Request::TLBI; @@ -378,9 +371,30 @@ let {{ PendingDvm = true; } ''' + msrTlbiIop = ArmInstObjParams("msr", "Tlbi64LocalHub", "TlbiOp64", + { + "code" : tlbiCode, + "dvm_code" : dvmCode % + { + "dvmCheck" : "HCR hcr = Hcr64; hcr.fb && dvmEnabled" + } + }, + ["IsSerializeAfter", "IsNonSpeculative"]) + header_output += DvmTlbiDeclare.subst(msrTlbiIop) + decoder_output += DvmTlbiConstructor.subst(msrTlbiIop) + exec_output += BasicExecute.subst(msrTlbiIop) + exec_output += DvmInitiateAcc.subst(msrTlbiIop) + exec_output += DvmCompleteAcc.subst(msrTlbiIop) + msrTlbiSIop = ArmInstObjParams("msr", "Tlbi64ShareableHub", "TlbiOp64", - { "code" : tlbiCode, "dvm_code" : dvmCode }, - ["IsSerializeAfter", "IsNonSpeculative"]) + { + "code" : tlbiCode, + "dvm_code" : dvmCode % + { + "dvmCheck" : "dvmEnabled" + } + }, + ["IsSerializeAfter", "IsNonSpeculative"]) header_output += DvmTlbiDeclare.subst(msrTlbiSIop) decoder_output += DvmTlbiConstructor.subst(msrTlbiSIop) exec_output += BasicExecute.subst(msrTlbiSIop) diff --git a/src/arch/arm/isa/insts/misc.isa b/src/arch/arm/isa/insts/misc.isa index 9ee753e385..35b310ecb9 100644 --- a/src/arch/arm/isa/insts/misc.isa +++ b/src/arch/arm/isa/insts/misc.isa @@ -848,7 +848,8 @@ let {{ ''' unknownIop = ArmInstObjParams("unknown", "Unknown", 
"UnknownOp", \ { "code": unknownCode, - "predicate_test": predicateTest }) + "predicate_test": predicateTest }, + ['IsInvalid']) header_output += BasicDeclare.subst(unknownIop) decoder_output += BasicConstructor.subst(unknownIop) exec_output += PredOpExecute.subst(unknownIop) diff --git a/src/arch/arm/isa/insts/misc64.isa b/src/arch/arm/isa/insts/misc64.isa index 5678195415..266467e9d8 100644 --- a/src/arch/arm/isa/insts/misc64.isa +++ b/src/arch/arm/isa/insts/misc64.isa @@ -183,7 +183,7 @@ let {{ return std::make_shared(machInst, true); ''' unknown64Iop = ArmInstObjParams("unknown", "Unknown64", "UnknownOp64", - unknownCode) + unknownCode, ['IsInvalid']) header_output += BasicDeclare.subst(unknown64Iop) decoder_output += BasicConstructor64.subst(unknown64Iop) exec_output += BasicExecute.subst(unknown64Iop) diff --git a/src/arch/arm/isa/insts/sve.isa b/src/arch/arm/isa/insts/sve.isa index 9999843b59..148a31fdbc 100644 --- a/src/arch/arm/isa/insts/sve.isa +++ b/src/arch/arm/isa/insts/sve.isa @@ -325,6 +325,28 @@ output header {{ } } + + // Decodes binary with immediate operand, destructive, unpredicated + // SVE instructions, handling unsigned variants only. + template